Token classifier demo: two standalone scripts that run the HavelockAI/bert-token-classifier model over the same passage in two registers (dense academic prose and a conversational, podcast-style rewrite), printing the per-token hedge-type labels the model predicts.
```python
import json

import torch
from huggingface_hub import hf_hub_download
from transformers import AutoModel, AutoTokenizer


def main():
    model_name = "HavelockAI/bert-token-classifier"

    # Load tokenizer + model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
    model.eval()

    # Load type map
    type_map_path = hf_hub_download(model_name, "type_to_idx.json")
    with open(type_map_path) as f:
        type_to_idx = json.load(f)
    idx_to_type = {v: k for k, v in type_to_idx.items()}

    # The three academic paragraphs
    text = (
        "If one were to assume that the manuscript under consideration, "
        "which might plausibly be interpreted as reflecting certain late-structuralist "
        "commitments, were situated within a broader interdisciplinary discourse, "
        "then it could be argued that its methodological ambiguities would likely "
        "illuminate how epistemic hedging functions across overlapping analytical frameworks.\n\n"
        "In particular, if the reader were to encounter a claim that, insofar as it "
        "may tentatively be understood as provisionally substantiated, appears to "
        "challenge prevailing orthodoxies, then that claim would seem to exemplify "
        "the manner in which layered qualifications can simultaneously signal caution, "
        "reflexivity, and theoretical openness.\n\n"
        "Thus, if it were the case that the argument, which might reasonably be said "
        "to be only partially determinate, were evaluated according to conventional "
        "standards of evidentiary rigor, one might conclude that its apparent "
        "indeterminacy would, under certain interpretive conditions, constitute not "
        "a weakness but a productive site of conceptual negotiation."
    )

    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)

    with torch.no_grad():
        logits = model(**inputs)

    # Use Viterbi decoding if CRF is available, else argmax
    if hasattr(model, "decode"):
        preds = model.decode(inputs["input_ids"], inputs["attention_mask"])
    else:
        preds = logits.argmax(dim=-1)  # (1, seq, num_types)

    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

    print(f"Logits shape: {tuple(logits.shape)}")
    print()
    for i, token in enumerate(tokens):
        active = [
            f"{idx_to_type[t]}={'OBI'[v]}"
            for t, v in enumerate(preds[0, i].tolist())
            if v > 0
        ]
        if active:
            print(f"{token:15} {', '.join(active)}")

    # Simple runtime checks (not pytest)
    assert logits.shape == (
        1,
        inputs["input_ids"].shape[1],
        len(type_to_idx),
        3,
    ), "Unexpected logits shape"
    assert (preds > 0).any(), "Model predicted all-O for every type"


if __name__ == "__main__":
    main()
```
```python
import json

import torch
from huggingface_hub import hf_hub_download
from transformers import AutoModel, AutoTokenizer


def main():
    model_name = "HavelockAI/bert-token-classifier"

    # Load tokenizer + model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
    model.eval()

    # Load type map
    type_map_path = hf_hub_download(model_name, "type_to_idx.json")
    with open(type_map_path) as f:
        type_to_idx = json.load(f)
    idx_to_type = {v: k for k, v in type_to_idx.items()}

    # The three podcast-style paragraphs
    text = (
        "So imagine you're reading this manuscript and you're thinking, okay, "
        "this definitely has some late-structuralist vibes going on. If you zoom out "
        "and place it in a bigger interdisciplinary conversation, you start to see "
        "that all those methodological gray areas aren't just accidental. They’re "
        "actually showing you how people hedge their claims differently depending "
        "on which analytical lens they're using.\n\n"
        "Now picture a reader coming across a claim that kind of pushes against "
        "the usual orthodoxy. It’s framed cautiously — lots of 'maybe,' 'arguably,' "
        "and 'to some extent.' That layering of qualifications isn’t just academic "
        "nervousness. It’s doing work. It signals caution, sure, but it also shows "
        "self-awareness and leaves the door open for alternative interpretations.\n\n"
        "And here’s the interesting part: if you judged the whole argument by super "
        "strict, traditional standards of evidence, you might initially think the "
        "fuzziness is a flaw. But under the right interpretive lens, that "
        "indeterminacy becomes productive. It turns into a space where meaning is "
        "negotiated rather than declared — and that’s kind of the point."
    )

    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)

    with torch.no_grad():
        logits = model(**inputs)

    # Use Viterbi decoding if CRF is available, else argmax
    if hasattr(model, "decode"):
        preds = model.decode(inputs["input_ids"], inputs["attention_mask"])
    else:
        preds = logits.argmax(dim=-1)  # (1, seq, num_types)

    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

    print(f"Logits shape: {tuple(logits.shape)}")
    print()
    for i, token in enumerate(tokens):
        active = [
            f"{idx_to_type[t]}={'OBI'[v]}"
            for t, v in enumerate(preds[0, i].tolist())
            if v > 0
        ]
        if active:
            print(f"{token:15} {', '.join(active)}")

    # Simple runtime checks (not pytest)
    assert logits.shape == (
        1,
        inputs["input_ids"].shape[1],
        len(type_to_idx),
        3,
    ), "Unexpected logits shape"
    assert (preds > 0).any(), "Model predicted all-O for every type"


if __name__ == "__main__":
    main()
```
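If you want contiguous spans rather than per-token labels, the same B/I codes can be folded into token ranges. A rough sketch under the (seq, num_types) prediction layout used above; this helper is not part of the gist, the type names are hypothetical, and it ignores WordPiece subword merging:

```python
def bio_to_spans(pred_rows, idx_to_type):
    """Collapse per-token B/I labels (one column per hedge type) into
    (type, start, end) token spans, end-exclusive.

    pred_rows: sequence of rows, one value per type: 0=O, 1=B, 2=I.
    """
    spans = []
    for t, type_name in idx_to_type.items():
        start = None
        for i, row in enumerate(pred_rows):
            v = row[t]
            if v == 1:  # B: close any open span, open a new one
                if start is not None:
                    spans.append((type_name, start, i))
                start = i
            elif v == 2 and start is not None:  # I: extend the open span
                continue
            elif start is not None:  # O (or stray I): close the open span
                spans.append((type_name, start, i))
                start = None
        if start is not None:
            spans.append((type_name, start, len(pred_rows)))
    return spans


# e.g. with the dummy layout from the earlier sketch (hypothetical type names):
print(bio_to_spans([[1, 0], [2, 0], [0, 1]], {0: "epistemic", 1: "approximator"}))
# [('epistemic', 0, 2), ('approximator', 2, 3)]
```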