Token analysis
# %%
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
from transformers import AutoTokenizer
# %%
model_name = "google/gemma-3-1b-it"
# Define maximum sequence length for the tokenizer
max_seq_length = 1024
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    model_max_length=max_seq_length,  # tokenizers take model_max_length; device_map applies to models, not tokenizers
)
# Store the EOS token for later use
EOS_TOKEN = tokenizer.eos_token
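# %%
# Illustrative sanity check (not in the original gist): tokenize a made-up
# sample headline to confirm the tokenizer works and inspect the EOS token.
sample = "Company X reports record quarterly profits."
print(f"Sample token ids: {tokenizer(sample).input_ids}")
print(f"EOS token: {EOS_TOKEN!r} (id {tokenizer.eos_token_id})")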
# %%
filename = "https://github.com/lmassaron/Gemma-3-1B-financial-sentiment-analysis/raw/refs/heads/main/all-data.csv"
df = pd.read_csv(
    filename,
    names=["sentiment", "text"],
    encoding="utf-8",
    encoding_errors="replace",
)
# %%
df.head()
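# %%
# Illustrative addition: check the class balance before building prompts;
# assumes the "sentiment" column holds the labels, as set in read_csv above.
print(df["sentiment"].value_counts())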
# %%
# Function to generate training prompts: the headline in brackets plus its ground-truth label
def generate_train_prompt(data_point):
    return f"""
Analyze the sentiment of the news headline enclosed in square brackets.
Determine if it is positive, neutral, or negative, and return the corresponding sentiment label:
"positive", "neutral", or "negative".
[{data_point["text"]}] = {data_point["sentiment"]}
""".strip() + EOS_TOKEN
# Apply the prompt template to every row of the dataset
prompts = pd.DataFrame(df.apply(generate_train_prompt, axis=1), columns=["text"])
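# %%
# Illustrative addition: print one generated prompt to verify that the
# bracketed headline, label, and trailing EOS token render as intended.
print(prompts["text"].iloc[0])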
# %%
def analyze_token_lengths(texts, tokenizer):
    """Tokenizes each item in a dataset and returns a list of token lengths."""
    token_lengths = []
    for prompt_text in tqdm(texts, desc="Tokenizing samples"):
        num_tokens = len(tokenizer(prompt_text).input_ids)
        token_lengths.append(num_tokens)
    return token_lengths
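# %%
# Alternative sketch (hypothetical helper, not in the original): batching the
# tokenizer call is usually much faster than the per-sample loop above when a
# fast (Rust-backed) tokenizer is available; the lengths should match exactly.
def analyze_token_lengths_batched(texts, tokenizer, batch_size=256):
    """Tokenizes texts in batches and returns a list of token lengths."""
    token_lengths = []
    texts = list(texts)
    for start in range(0, len(texts), batch_size):
        batch_ids = tokenizer(texts[start:start + batch_size]).input_ids
        token_lengths.extend(len(ids) for ids in batch_ids)
    return token_lengths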
# %%
all_token_lengths = analyze_token_lengths(prompts["text"], tokenizer)
# %%
print(f"Total tokens: {sum(all_token_lengths)}")
print(f"Max tokens per prompt: {max(all_token_lengths)}")
print(f"Average tokens per prompt: {sum(all_token_lengths) / len(all_token_lengths):.2f}")
# %%
plt.style.use("seaborn-v0_8-whitegrid")
fig, ax = plt.subplots(figsize=(12, 7))
max_overall_length = max(all_token_lengths)  # all_token_lengths is a flat list of ints
bins = np.linspace(0, max_overall_length, 100)  # 100 bins up to the max length
ax.hist(
    all_token_lengths,
    bins=bins,
    alpha=0.6,  # semi-transparent bars
    label=f"Prompts (n={len(all_token_lengths)})",
)
ax.set_title("Distribution of Prompt Token Lengths", fontsize=16)
ax.set_xlabel("Token Length", fontsize=12)
ax.set_ylabel("Number of Samples", fontsize=12)
ax.legend(fontsize=12)
ax.set_xlim(0, max_overall_length) # Set x-axis limit
plt.show()
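# %%
# Illustrative addition: count prompts that would overflow the configured
# max_seq_length and be truncated during fine-tuning.
n_over = sum(length > max_seq_length for length in all_token_lengths)
print(f"Prompts longer than {max_seq_length} tokens: {n_over} of {len(all_token_lengths)}")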
# %%