Skip to content

Instantly share code, notes, and snippets.

@rndblnch
Created April 5, 2025 11:58
Show Gist options
  • Select an option

  • Save rndblnch/2b496948204c5453c45a74c2518ec463 to your computer and use it in GitHub Desktop.

Select an option

Save rndblnch/2b496948204c5453c45a74c2518ec463 to your computer and use it in GitHub Desktop.
scraper for NOAA data
#! /usr/bin/env python3
# imports ###################################################################
import os
import os.path
from collections import deque
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup as BS
# data ######################################################################
# Base URL of the NOAA National Centers for Environmental Information server.
URL = "https://www.ncei.noaa.gov/"
# Work queue of server-relative directory paths still to crawl (BFS order:
# the loop below appends new directories on the left and pops from the right,
# so parent directories are always visited before their children).
PATHS = deque(['data/noaa-global-surface-temperature/v6/'])
def query(href='', decode=True):
    """Fetch ``URL + href`` and return the response body.

    Parameters:
        href: server-relative path appended to the module-level ``URL``.
        decode: when True (default) decode the body to ``str`` using the
            charset advertised in the Content-Type header (falling back to
            ASCII); when False return the raw ``bytes`` unchanged.

    Raises:
        RuntimeError: if the server does not answer with HTTP 200.
            (Was an ``assert`` originally — asserts vanish under ``-O``.)
    """
    req = Request(
        URL + href,
        headers={
            # Browser-like UA: the NOAA server rejects the default
            # urllib user agent.
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
        }
    )
    # Context manager closes the connection deterministically; the original
    # leaked the response object.
    with urlopen(req) as resp:
        if resp.status != 200:
            raise RuntimeError("server responded %s" % resp.status)
        raw = resp.read()
        content_type = resp.getheader('content-type').lower()
    if not decode:
        return raw
    # Extract the charset parameter from the Content-Type header, e.g.
    # "text/html; charset=utf-8".  Default to ASCII when absent (the
    # original only handled a bare "text/plain" and exactly one ';' param,
    # crashing on anything else).
    encoding = 'ascii'
    for param in content_type.split(';')[1:]:
        key, _, value = param.strip().partition('=')
        if key == 'charset' and value:
            encoding = value
            break
    return raw.decode(encoding)
# Breadth-first crawl of the remote directory listing, mirroring it onto
# the local filesystem.  Directory hrefs (ending in '/') are enqueued and
# created locally; file hrefs are downloaded unless they already exist.
while PATHS:
    path = PATHS.pop()  # oldest entry (appendleft + pop == FIFO queue)
    soup = BS(query(path), features="html.parser")
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        # Skip anchors with no usable href (None/empty — the original
        # crashed on href[0] here), sort-order query links ('?...') and
        # absolute paths such as the "parent directory" link ('/...').
        if not href or href[0] in '?/':
            continue
        todo = path + href
        exists = os.path.exists(todo)
        if todo.endswith('/'):
            # Directory: enqueue for crawling, create the local dir once.
            PATHS.appendleft(todo)
            if not exists:
                print('creating', todo)
                os.mkdir(todo)
        elif not exists:
            # Regular file: download raw bytes, skipping existing files
            # so the scrape is resumable.
            print('downloading', todo)
            with open(todo, 'wb') as output:
                output.write(query(todo, False))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment