Created August 16, 2022 15:01
Small script to scrape all posts from stacker.news
import base64
import json
from requests_cache import CachedSession
from datetime import timedelta
from itertools import count

# Cache POST requests to avoid sending the same query twice
item_session = CachedSession(
    'items_cache',
    expire_after=timedelta(days=21),  # cached responses expire after 21 days
    allowable_methods=['GET', 'POST'],
)


def get_all_posts():
    all_posts = []
    offset_gen = count(0, 21)  # stacker.news pages contain 21 items each
    while True:
        offset = next(offset_gen)
        # The API expects the cursor as base64-encoded JSON holding the offset and a timestamp
        cursor = base64.b64encode(json.dumps({"offset": offset, "time": "2025-01-01T00:00:00"}).encode()).decode()
        json_data = {  # contains the GraphQL query for one page of items
            'operationName': 'items',
            'variables': {
                'cursor': cursor,
            },
            'query': 'fragment ItemFields on Item {\n  id\n  parentId\n  createdAt\n  title\n  url\n  user {\n    name\n    id\n    __typename\n  }\n  fwdUser {\n    name\n    id\n    __typename\n  }\n  sats\n  upvotes\n  boost\n  path\n  meSats\n  ncomments\n  maxBid\n  company\n  location\n  remote\n  sub {\n    name\n    baseCost\n    __typename\n  }\n  pollCost\n  status\n  uploadId\n  mine\n  root {\n    id\n    title\n    sub {\n      name\n      __typename\n    }\n    user {\n      name\n      id\n      __typename\n    }\n    __typename\n  }\n  __typename\n}\n\nquery items($sub: String, $sort: String, $cursor: String, $name: String, $within: String) {\n  items(sub: $sub, sort: $sort, cursor: $cursor, name: $name, within: $within) {\n    cursor\n    items {\n      ...ItemFields\n      position\n      __typename\n    }\n    pins {\n      ...ItemFields\n      position\n      __typename\n    }\n    __typename\n  }\n}\n',
        }
        response = item_session.post('https://stacker.news/api/graphql', json=json_data)
        items = response.json()["data"]["items"]["items"]
        if len(items) == 0:  # an empty page means we have reached the end
            break
        all_posts.extend(items)
    return all_posts


all_posts = get_all_posts()
with open('posts.json', 'w') as outfile:
    json.dump(all_posts, outfile)
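A minimal sketch of how the output might be inspected afterwards, assuming the script above has already written posts.json and that each item carries the createdAt and title fields selected in the GraphQL query:

import json

# Load the file written by the scraper above
with open('posts.json') as infile:
    posts = json.load(infile)

print(f"Scraped {len(posts)} posts")

# Show the ten most recent posts, assuming createdAt is an ISO-8601 string
# so lexicographic order matches chronological order
for post in sorted(posts, key=lambda p: p.get('createdAt') or '', reverse=True)[:10]:
    print(post.get('createdAt'), '-', post.get('title') or '(no title)')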