Skip to content

Instantly share code, notes, and snippets.

@davisv7
Created August 16, 2022 15:01
Show Gist options
  • Select an option

  • Save davisv7/c575e77b82db548468ccfebdcb34d5f8 to your computer and use it in GitHub Desktop.

Select an option

Save davisv7/c575e77b82db548468ccfebdcb34d5f8 to your computer and use it in GitHub Desktop.
Small script to scrape all posts from stacker.news
import base64
import json
from requests_cache import CachedSession
from datetime import timedelta
from itertools import count
# Shared session used for every API call. POST is listed as cacheable so
# that re-running the script does not resend a request it has already made.
item_session = CachedSession(
    'items_cache',
    allowable_methods=['GET', 'POST'],
    expire_after=timedelta(weeks=3),  # cached responses expire after 21 days
)
def get_all_posts(page_size=21, cursor_time="2025-01-01T00:00:00", timeout=30):
    """Fetch every item from the stacker.news GraphQL API, page by page.

    Requests are sent through the module-level ``item_session``. Paging
    stops at the first response whose item list is empty.

    Args:
        page_size: Amount the cursor offset advances between requests.
            NOTE(review): 21 is presumably the server's page size; a value
            that does not match it would skip or duplicate items -- confirm.
        cursor_time: Value of the ``time`` field placed in every cursor.
            NOTE(review): presumably an upper bound on item creation time;
            confirm against the API before changing it.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list with the item dicts of all pages, in the order received.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        RuntimeError: If the GraphQL response carries an ``errors`` entry.
    """
    # The query text never changes between pages, so build it once.
    query = 'fragment ItemFields on Item {\n id\n parentId\n createdAt\n title\n url\n user {\n name\n id\n __typename\n }\n fwdUser {\n name\n id\n __typename\n }\n sats\n upvotes\n boost\n path\n meSats\n ncomments\n maxBid\n company\n location\n remote\n sub {\n name\n baseCost\n __typename\n }\n pollCost\n status\n uploadId\n mine\n root {\n id\n title\n sub {\n name\n __typename\n }\n user {\n name\n id\n __typename\n }\n __typename\n }\n __typename\n}\n\nquery items($sub: String, $sort: String, $cursor: String, $name: String, $within: String) {\n items(sub: $sub, sort: $sort, cursor: $cursor, name: $name, within: $within) {\n cursor\n items {\n ...ItemFields\n position\n __typename\n }\n pins {\n ...ItemFields\n position\n __typename\n }\n __typename\n }\n}\n'

    all_posts = []
    for offset in count(0, page_size):
        # The cursor is a base64-encoded JSON object holding offset and time.
        cursor_json = json.dumps({"offset": offset, "time": cursor_time})
        cursor = base64.b64encode(cursor_json.encode()).decode()
        json_data = {  # GraphQL request body
            'operationName': 'items',
            'variables': {
                'cursor': cursor,
            },
            'query': query,
        }
        response = item_session.post(
            'https://stacker.news/api/graphql',
            json=json_data,
            timeout=timeout,  # without a timeout a stalled connection hangs forever
        )
        # Fail loudly on HTTP errors instead of trying to parse an error page.
        response.raise_for_status()
        payload = response.json()
        if payload.get("errors"):
            raise RuntimeError(
                f"GraphQL request failed at offset {offset}: {payload['errors']}"
            )
        items = payload["data"]["items"]["items"]
        if not items:  # an empty page marks the end of the listing
            break
        all_posts.extend(items)
    return all_posts
# Download everything, then persist the result as a single JSON document.
all_posts = get_all_posts()
serialized_posts = json.dumps(all_posts)
with open('posts.json', 'w') as outfile:
    outfile.write(serialized_posts)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment