Created
April 5, 2025 11:58
-
-
Save rndblnch/2b496948204c5453c45a74c2518ec463 to your computer and use it in GitHub Desktop.
Scraper for NOAA data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #! /usr/bin/env python3 | |
| # imports ################################################################### | |
| import os | |
| import os.path | |
| from collections import deque | |
| from urllib.request import urlopen, Request | |
| from bs4 import BeautifulSoup as BS | |
| # data ###################################################################### | |
# Base address of the NCEI server; request paths are appended to it.
URL = 'https://www.ncei.noaa.gov/'
# Work queue of remote directory paths (relative to URL) that still have to
# be crawled, seeded with the root of the dataset.
PATHS = deque(('data/noaa-global-surface-temperature/v6/',))
def query(href='', decode=True):
    """Fetch ``URL + href`` and return the response body.

    Args:
        href: Path appended to the module-level ``URL`` base address.
        decode: When true (the default), decode the body to ``str`` using
            the charset announced in the ``Content-Type`` header, falling
            back to UTF-8 when none is given. When false, return the raw
            ``bytes`` unchanged.

    Returns:
        ``str`` when ``decode`` is true, ``bytes`` otherwise.

    Raises:
        AssertionError: If the server answers with a status other than 200.
        urllib.error.URLError: Propagated from ``urlopen`` on network or
            HTTP failures.
        LookupError: If the announced charset is unknown to Python.
    """
    req = Request(
        URL + href,
        headers={
            # Browser-like User-Agent; presumably needed because the server
            # rejects urllib's default one (not verified here).
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
        }
    )
    # The context manager releases the connection even if reading fails.
    with urlopen(req) as resp:
        if resp.status != 200:
            # Same exception type and message as the former ``assert``, but
            # raised explicitly so the check is not stripped by ``python -O``.
            raise AssertionError("server responded %s" % resp.status)
        raw = resp.read()
        # Handles a missing header, a missing or quoted charset parameter
        # and additional parameters; returns None when nothing is announced.
        charset = resp.headers.get_content_charset()
    if not decode:
        return raw
    # UTF-8 is a superset of ASCII, so plain-text responses decode as before.
    return raw.decode(charset or 'utf-8')
# Crawl the remote directory listings first-in-first-out (pop right,
# appendleft) and mirror them under the current working directory.
# Files that already exist locally are not downloaded again.
while PATHS:
    path = PATHS.pop()
    soup = BS(query(path), features="html.parser")
    for a in soup.find_all('a'):
        href = a.get('href')
        # Anchors without an href (None) or with an empty one would crash
        # the indexing below; '?', '/' and '#' links are not listing entries.
        if not href or href[0] in '?/#':
            continue
        # Absolute URLs and parent references would escape the mirrored tree
        # (hrefs come from the remote page and are used as local paths).
        if '://' in href or '..' in href.split('/'):
            continue
        todo = path + href
        exists = os.path.exists(todo)
        if todo.endswith('/'):
            PATHS.appendleft(todo)
            if not exists:
                print('creating', todo)
                # makedirs also creates missing parents, e.g. the seed
                # directory, which is never created on its own.
                os.makedirs(todo, exist_ok=True)
        elif not exists:
            print('downloading', todo)
            # Fetch before touching the disk so a failed request cannot
            # leave an empty file that later runs would treat as complete.
            data = query(todo, False)
            parent = os.path.dirname(todo)
            if parent:
                os.makedirs(parent, exist_ok=True)
            # Write to a side file and move it into place once complete, so
            # an interrupted write never leaves a truncated target behind.
            partial = todo + '.part'
            with open(partial, 'wb') as output:
                output.write(data)
            os.replace(partial, todo)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment