#!/usr/bin/python3.14
import arxiv
import sh
from json import load, loads, dump, dumps, JSONDecodeError
from sys import argv
from os import makedirs, listdir, getcwd
from os.path import join, isdir, isfile, realpath
from typing import List, Tuple
from pydantic import RootModel
from operator import itemgetter
from rich import print as rprint
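# Note: arxiv, sh, pydantic and rich are third-party packages (not in the
# standard library); assuming the PyPI package names match the import names,
# they can be installed with `pip install arxiv sh pydantic rich`.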

class DataTuple(RootModel[Tuple[str, str, str]]):
    """
    A Pydantic model for a tuple containing three strings.
    The root value is accessed via the `root` attribute.
    """
    pass
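
# Minimal sketch of how DataTuple is used below (the values here are made up):
#   header = DataTuple.model_validate(("2301.00001v1", "Some title", "2023-01-01"))
#   paper_id, title, published = header.root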

def search_papers(topic: str, max_results: int = 10, PAPER_DIR: str = ".") -> List[DataTuple]:
    """
    Search for papers on arXiv based on a topic and store their information.

    Args:
        topic: The topic to search for
        max_results: Maximum number of results to retrieve (default: 10)
        PAPER_DIR: Base directory in which the per-topic subdirectory is created

    Returns:
        List of (paper_id, title, published) tuples for the papers found
    """
    # Use arxiv to find the papers
    client = arxiv.Client()

    # Search for the most relevant articles matching the queried topic
    search = arxiv.Search(
        query=topic,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.Relevance
    )
    papers = client.results(search)

    # Create directory for this topic
    path = join(PAPER_DIR, topic.lower().replace(" ", "_"))
    makedirs(path, exist_ok=True)
    file_path = join(path, "papers_info.json")

    # Try to load existing papers info
    try:
        with open(file_path, "r") as json_file:
            papers_info = load(json_file)
    except (FileNotFoundError, JSONDecodeError):
        papers_info = {}

    # Process each paper and add to papers_info
    paper_headers = []
    for paper in papers:
        paper_id = paper.get_short_id()
        published = str(paper.published.date())
        raw_data = (paper_id, paper.title, published)
        paper_headers.append(DataTuple.model_validate(raw_data))
        paper_info = {
            'title': paper.title,
            'authors': [author.name for author in paper.authors],
            'summary': paper.summary,
            'pdf_url': paper.pdf_url,
            'published': published
        }
        papers_info[paper_id] = paper_info

    # Save updated papers_info to json file
    with open(file_path, "w") as json_file:
        dump(papers_info, json_file, indent=2)
    print(f"Results are saved in: {file_path}")

    return paper_headers
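
# Illustrative sketch of the papers_info.json layout written by search_papers()
# (the paper id and field values below are placeholders, not real output):
#   {
#     "2301.00001v1": {
#       "title": "...",
#       "authors": ["..."],
#       "summary": "...",
#       "pdf_url": "https://arxiv.org/pdf/...",
#       "published": "2023-01-01"
#     }
#   }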

def extract_info(paper_id: str, PAPER_DIR: str = ".") -> str:
    """
    Search for information about a specific paper across all topic directories.

    Args:
        paper_id: The ID of the paper to look for
        PAPER_DIR: Base directory containing the per-topic subdirectories

    Returns:
        JSON string with paper information if found, error message if not found
    """
    for item in listdir(PAPER_DIR):
        item_path = join(PAPER_DIR, item)
        if isdir(item_path):
            file_path = join(item_path, "papers_info.json")
            if isfile(file_path):
                try:
                    with open(file_path, "r") as json_file:
                        papers_info = load(json_file)
                    if paper_id in papers_info:
                        return dumps(papers_info[paper_id], indent=2)
                except (FileNotFoundError, JSONDecodeError) as e:
                    print(f"Error reading {file_path}: {str(e)}")
                    continue

    return f"There's no saved information related to paper {paper_id}."

if __name__ == "__main__":
    if len(argv) < 2:
        print("\nUsage: " + realpath(argv[0]) + " '<topic>' [<id>]\n")
        exit(0)

    PAPER_DIR = getcwd()
    paper_headers = search_papers(argv[1], 20, PAPER_DIR)

    # Sort the (id, title, published) headers by publication date, newest first
    header_list = []
    for header in paper_headers:
        paper_id, paper_title, published = header.root
        header_list.append((paper_id, paper_title, published))
    header_list.sort(key=itemgetter(2), reverse=True)

    # Print the ten most recent papers
    for row in header_list[:10]:
        rprint(row[0], row[1], row[2])

    if len(argv) > 2:
        info_json = extract_info(argv[2], PAPER_DIR)
        try:
            info = loads(info_json)
        except JSONDecodeError:
            # extract_info returns a plain message when the paper id is not found
            print(info_json)
            exit(1)
        print(dumps(info, indent=4))

        # Download the paper's PDF and show its size
        pdf = argv[2] + ".pdf"
        sh.wget(info["pdf_url"], "-O", pdf)
        rprint(sh.ls('-ls', pdf).strip())
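
# Example invocations (assuming the script is saved as arxiv_search.py and is
# executable; the topic and paper id below are placeholders):
#   ./arxiv_search.py "quantum error correction"
#   ./arxiv_search.py "quantum error correction" 2301.00001v1
# The second form additionally pretty-prints the stored metadata for that paper
# id and downloads its PDF with wget.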