#!/usr/bin/python3.14
import arxiv
import sh
from json import load, loads, dump, dumps, JSONDecodeError
from sys import argv, exit
from os import makedirs, listdir, getcwd
from os.path import join, isdir, isfile, realpath
from typing import List, Tuple
from pydantic import RootModel
from operator import itemgetter
from rich import print as rprint
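
# Third-party dependencies (assumed installed): arxiv, sh, rich, and
# pydantic v2 (for RootModel/model_validate). Everything else is stdlib.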

class DataTuple(RootModel[Tuple[str, str, str]]):
    """
    A Pydantic model for a tuple containing three strings.
    The root value is accessed via the `root` attribute.
    """
    pass
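
# Example (hypothetical values): DataTuple.model_validate(("2101.00001v1",
# "Some Title", "2021-01-01")).root returns the validated 3-tuple unchanged,
# so a header can be unpacked as: paper_id, title, published = header.root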


def search_papers(topic: str, max_results: int = 10, PAPER_DIR: str = ".") -> List[DataTuple]:
    """
    Search for papers on arXiv based on a topic and store their information.

    Args:
        topic: The topic to search for
        max_results: Maximum number of results to retrieve (default: 10)
        PAPER_DIR: Base directory under which per-topic results are stored

    Returns:
        List of DataTuple (paper_id, title, published) headers found in the search
    """
    # Use arxiv to find the papers
    client = arxiv.Client()

    # Search for the most relevant articles matching the queried topic
    search = arxiv.Search(
        query=topic,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.Relevance
    )
    papers = client.results(search)

    # Create directory for this topic
    path = join(PAPER_DIR, topic.lower().replace(" ", "_"))
    makedirs(path, exist_ok=True)
    file_path = join(path, "papers_info.json")

    # Try to load existing papers info
    try:
        with open(file_path, "r") as json_file:
            papers_info = load(json_file)
    except (FileNotFoundError, JSONDecodeError):
        papers_info = {}

    # Process each paper and add to papers_info
    paper_headers = []
    for paper in papers:
        paper_id = paper.get_short_id()
        published = str(paper.published.date())
        raw_data = (paper_id, paper.title, published)
        paper_headers.append(DataTuple.model_validate(raw_data))
        paper_info = {
            'title': paper.title,
            'authors': [author.name for author in paper.authors],
            'summary': paper.summary,
            'pdf_url': paper.pdf_url,
            'published': published
        }
        papers_info[paper_id] = paper_info

    # Save updated papers_info to json file
    with open(file_path, "w") as json_file:
        dump(papers_info, json_file, indent=2)

    print(f"Results are saved in: {file_path}")
    return paper_headers
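
# papers_info.json maps short arXiv ids to metadata, e.g. (hypothetical entry):
#   {"2101.00001v1": {"title": "...", "authors": ["..."], "summary": "...",
#                     "pdf_url": "...", "published": "2021-01-01"}}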


def extract_info(paper_id: str, PAPER_DIR: str = ".") -> str:
    """
    Search for information about a specific paper across all topic directories.

    Args:
        paper_id: The ID of the paper to look for
        PAPER_DIR: Base directory containing the per-topic subdirectories

    Returns:
        JSON string with paper information if found, error message if not found
    """
    for item in listdir(PAPER_DIR):
        item_path = join(PAPER_DIR, item)
        if isdir(item_path):
            file_path = join(item_path, "papers_info.json")
            if isfile(file_path):
                try:
                    with open(file_path, "r") as json_file:
                        papers_info = load(json_file)
                    if paper_id in papers_info:
                        return dumps(papers_info[paper_id], indent=2)
                except (FileNotFoundError, JSONDecodeError) as e:
                    print(f"Error reading {file_path}: {str(e)}")
                    continue
    return f"There's no saved information related to paper {paper_id}."


if __name__ == "__main__":
    if len(argv) < 2:
        # Show the script itself (realpath of argv[0]) in the usage line,
        # not just its directory
        print("\nUsage: " + realpath(argv[0]) + " '<topic>' [<id>]\n")
        exit(0)
    PAPER_DIR = getcwd()
    paper_headers = search_papers(argv[1], 20, PAPER_DIR)
    # Unpack the validated tuples and print the ten most recently published papers
    header_list = [header.root for header in paper_headers]
    header_list.sort(key=itemgetter(2), reverse=True)
    for paper_id, paper_title, published in header_list[:10]:
        rprint(paper_id, paper_title, published)
    if len(argv) > 2:
        # extract_info returns a plain error message (not JSON) for unknown ids,
        # so guard the parse instead of crashing with JSONDecodeError
        info_json = extract_info(argv[2], PAPER_DIR)
        try:
            info = loads(info_json)
        except JSONDecodeError:
            print(info_json)
            exit(1)
        info_pretty = dumps(info, indent=4)
        print(info_pretty)
        pdf = argv[2] + ".pdf"
        sh.wget(info["pdf_url"], "-O", pdf)
        rprint(sh.ls('-ls', pdf).strip())
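
# Example invocation (hypothetical script name, topic, and paper id):
#   ./arxiv_search.py 'quantum error correction' 2301.01234v1
# The first argument runs the search and writes papers_info.json under a topic
# subdirectory of the current working directory; the optional second argument
# pretty-prints that paper's saved record and fetches its PDF via wget.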