|
from semanticscholar import SemanticScholar |
|
from collections import defaultdict |
|
from tqdm import tqdm |
|
import os |
|
|
|
# The API-key lookup below is disabled, so the client makes unauthenticated
# requests. To authenticate, read the key from the environment (rather than
# hard-coding it) and pass it to the client as ``api_key``.
#s2_api_key = os.environ.get("SEMANTIC_SCHOLAR_API_KEY")
sch = SemanticScholar(timeout=10)

# alec
author_ids = [
    2242015445,
    145151177,
    2251097727,
    2241538537,
    2244829783,
    2199254215,
    2309006642,
    2275054179,
    2312381751,
    2242589355,
]

# If author_ids is a single int, wrap it in a list
if isinstance(author_ids, int):
    author_ids = [author_ids]

# Batch fetch all authors in one call (IDs are passed as strings)
authors = sch.get_authors([str(aid) for aid in author_ids])

papers = []
names = []

for author in authors:
    names.append(author.name)
    # The author record may carry fewer papers than the author's reported
    # paperCount; warn so it is clear the paper list is incomplete.
    if len(author.papers) < author.paperCount:
        print(
            f"Warning: {author.name} has {author.paperCount} papers, "
            f"but only {len(author.papers)} were retrieved"
        )
    papers.extend(author.papers)

# From here on ``names`` is a single comma-separated string, not a list.
names = ", ".join(names)

# Index by paperId; a paper shared by several of the authors is kept once.
papers_by_id = {paper["paperId"]: paper for paper in papers}
|
|
|
# --- Batch fetch full paper records for these paper IDs ---
# The paper batch endpoint accepts at most 500 IDs per call.
BATCH_SIZE = 500

# Unique IDs of the authors' papers, in first-seen order. Entries without an
# ID are dropped because they cannot be looked up.
paper_ids = [paper_id for paper_id in papers_by_id if paper_id]

# paperId_counts: referenced paperId -> number of the authors' papers citing it.
# paperId_short_recs: referenced paperId -> short reference record, kept so the
# title/year can be printed later.
paperId_counts = defaultdict(int)
paperId_short_recs = {}

for i in range(0, len(paper_ids), BATCH_SIZE):
    batch_ids = paper_ids[i : i + BATCH_SIZE]

    # Ask specifically for references, title, year to minimize payload
    batch_papers = sch.get_papers(
        batch_ids,
        fields=["title", "year", "references.title", "references.year"],
    )

    for authors_paper_full_rec in batch_papers:
        # Get its references (may be list of dicts or objects depending on
        # version); a missing or None value is treated as "no references".
        references = getattr(authors_paper_full_rec, "references", []) or []

        for ref in references:
            # Handle both dict-style and object-style references
            if isinstance(ref, dict):
                ref_data = ref
            else:
                ref_data = getattr(ref, "raw_data", {}) or {}

            ref_id = ref_data.get("paperId")
            if not ref_id:
                # Reference has no paperId, so it cannot be tallied.
                continue

            paperId_counts[ref_id] += 1
            # Store the short record so we can print title/year later
            paperId_short_recs[ref_id] = ref_data
|
|
|
# --- Sort and print results ---
# Most-cited references first; sorted() is stable, so references with equal
# counts keep the order in which they were first tallied.
paperId_counts_sorted = sorted(
    paperId_counts.items(), key=lambda item: -item[1]
)

for paperId, count in paperId_counts_sorted:
    rec = paperId_short_recs.get(paperId, {})
    # A field may be present with a None value rather than absent, in which
    # case .get(key, default) would return None; ``or`` covers both cases.
    title = rec.get("title") or "UNKNOWN TITLE"
    year = rec.get("year") or "UNKNOWN YEAR"
    print(f'{count}\t"{title}," {year}')