Last active
September 27, 2025 08:42
-
-
Save lmassaron/6af8d535f25c72c93473c972b8afb592 to your computer and use it in GitHub Desktop.
Token analysis
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # %% | |
| import pandas as pd | |
| import matplotlib.pyplot as plt | |
| import numpy as np | |
| from tqdm import tqdm | |
| from transformers import AutoTokenizer | |
# %%
model_name = "google/gemma-3-1b-it"
# Define maximum sequence length for the tokenizer
max_seq_length = 1024
# Load the tokenizer.
# FIX: `max_seq_length` and `device_map` are not AutoTokenizer arguments —
# a tokenizer has no device, and the sequence-length cap is `model_max_length`
# (the original kwargs were silently stashed in `init_kwargs` and ignored).
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    model_max_length=max_seq_length,
)
# Store the EOS token for later use (appended to every training prompt).
EOS_TOKEN = tokenizer.eos_token
# %%
# Financial PhraseBank CSV hosted in the companion GitHub repository.
filename = "https://github.com/lmassaron/Gemma-3-1B-financial-sentiment-analysis/raw/refs/heads/main/all-data.csv"
df = pd.read_csv(
    filename,
    names=["sentiment", "text"],  # the raw file ships without a header row
    encoding="utf-8",
    encoding_errors="replace",  # tolerate any stray non-UTF-8 bytes
)
# %%
df.head()
# %%
# Function to generate training and evaluation prompts
def generate_train_prompt(data_point, eos_token=None):
    """Build one supervised fine-tuning prompt for a sentiment example.

    Parameters
    ----------
    data_point : mapping
        Must expose ``"text"`` (the headline) and ``"sentiment"`` (the gold
        label) keys — e.g. a pandas row from ``DataFrame.apply(..., axis=1)``.
    eos_token : str, optional
        End-of-sequence marker appended to the prompt. Defaults to the
        module-level ``EOS_TOKEN`` captured from the tokenizer, keeping the
        original call signature backward-compatible.

    Returns
    -------
    str
        The instruction text, the bracketed headline, and the gold label,
        terminated by the EOS token.
    """
    if eos_token is None:
        eos_token = EOS_TOKEN
    return f"""
Analyze the sentiment of the news headline enclosed in square brackets.
Determine if it is positive, neutral, or negative, and return the corresponding sentiment label:
"positive", "neutral", or "negative".
[{data_point["text"]}] = {data_point["sentiment"]}
""".strip() + eos_token
# Apply the prompt template to every row, yielding a one-column ("text") frame.
prompts = df.apply(generate_train_prompt, axis=1).to_frame(name="text")
| # %% | |
def analyze_token_lengths(texts, tokenizer):
    """Return the number of tokens produced by *tokenizer* for each text.

    Iterates *texts* with a tqdm progress bar and measures the length of the
    ``input_ids`` sequence that the tokenizer emits for every item.
    """
    return [
        len(tokenizer(sample).input_ids)
        for sample in tqdm(texts, desc="Tokenizing samples")
    ]
# %%
all_token_lengths = analyze_token_lengths(prompts["text"], tokenizer)
# %%
# Summary statistics over the per-prompt token counts.
total_tokens = sum(all_token_lengths)
sample_count = len(all_token_lengths)
print(f"Total tokens: {total_tokens}")
print(f"Max tokens per prompt: {max(all_token_lengths)}")
print(f"Average tokens per prompt: {total_tokens / sample_count:.2f}")
# %%
plt.style.use("seaborn-v0_8-whitegrid")
fig, ax = plt.subplots(figsize=(12, 7))
# FIX: `all_token_lengths` is a flat list of ints, so the original
# `max(np.max(lengths) for lengths in all_token_lengths)` called np.max on
# each scalar — a leftover from a multi-dataset version. A plain max suffices.
max_overall_length = max(all_token_lengths)
# FIX: linspace with 100 points gives 99 bins; 101 edges yield the intended 100.
bins = np.linspace(0, max_overall_length, 101)
ax.hist(
    all_token_lengths,
    bins=bins,
    alpha=0.6,  # Use alpha for transparency to see overlapping distributions
    label=f"Prompts (n={len(all_token_lengths)})",
)
ax.set_title("Distribution of Prompt Token Lengths", fontsize=16)
ax.set_xlabel("Token Length", fontsize=12)
ax.set_ylabel("Number of Samples", fontsize=12)
ax.legend(fontsize=12)
ax.set_xlim(0, max_overall_length)  # Set x-axis limit
plt.show()
# %%
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment