rndblnch · April 5, 2025 11:58
diff --git a/download-noaa.py b/download-noaa.py
 #! /usr/bin/env python3

 # imports ###################################################################

 import os
 import os.path

 from collections import deque
 from urllib.request import urlopen, Request
 from bs4 import BeautifulSoup as BS


 # data ######################################################################

 # Base URL of the NOAA NCEI server; query() prepends it to every href.
 URL = "https://www.ncei.noaa.gov/"
 # Work queue of server-relative directory paths still to be crawled, seeded
 # with the starting directory. The main loop pops from it until it is empty
 # and also uses these same relative paths as local file-system paths.
 PATHS = deque(['data/noaa-global-surface-temperature/v6/'])


 def query(href='', decode=True):
     """Fetch ``URL + href`` and return the response body.

     Parameters:
         href: path appended to the module-level ``URL`` base.
         decode: when true (the default) the body is returned as ``str``;
             when false the raw ``bytes`` are returned untouched.

     Returns:
         ``str`` if *decode* is true, ``bytes`` otherwise.

     Raises:
         RuntimeError: the server answered with a status other than 200.
         UnicodeDecodeError: the body does not match the selected encoding.
         LookupError: the server declared a charset Python does not know.
         Any exception raised by ``urlopen`` is propagated unchanged.
     """
     req = Request(
         URL + href,
         headers={
             # Browser-like User-Agent; presumably the server rejects the
             # default urllib one -- TODO confirm.
             'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
         },
     )

     # The context manager closes the connection even if reading fails.
     with urlopen(req) as resp:
         # Explicit check rather than ``assert``, which is stripped under -O.
         if resp.status != 200:
             raise RuntimeError("server responded %s" % resp.status)
         raw = resp.read()
         # None when the Content-Type header is missing or has no charset
         # parameter; copes with extra parameters and quoted values too.
         charset = resp.headers.get_content_charset()

     if not decode:
         return raw

     # Without a declared charset fall back to ASCII, which is what a bare
     # "text/plain" response has always been decoded with here.
     return raw.decode(charset or 'ascii')

 # Breadth-first mirror of the remote directory tree: listings are taken from
 # the right end of PATHS and newly found sub-directories are queued on the
 # left, so every directory is visited in discovery order.
 while PATHS:
     path = PATHS.pop()
     # The starting directory (and its parents) is never created elsewhere;
     # without this the first mkdir/open below fails on a fresh run.
     os.makedirs(path, exist_ok=True)
     soup = BS(query(path), features="html.parser")
     for a in soup.find_all('a'):
         href = a.get('href')
         # Anchors with no (or an empty) href have nothing to follow; '?'
         # and '/' prefixes are query-string and absolute-path links.
         if not href or href[0] in '?/':
             continue
         # The href becomes part of a local path: refuse absolute URLs and
         # "." / ".." segments so the crawl cannot loop or leave the tree.
         if '://' in href or any(part in ('.', '..') for part in href.split('/')):
             continue
         todo = path + href
         exists = os.path.exists(todo)
         if todo.endswith('/'):
             PATHS.appendleft(todo)
             if not exists:
                 print('creating', todo)
                 os.makedirs(todo, exist_ok=True)
         elif not exists:
             print('downloading', todo)
             # Download before touching the disk, then publish the file
             # with a rename: an interrupted run never leaves a truncated
             # file that later runs would skip as "already downloaded".
             data = query(todo, False)
             partial = todo + '.part'
             with open(partial, 'wb') as output:
                 output.write(data)
             os.replace(partial, todo)
	#! /usr/bin/env python3

	# imports ###################################################################

	import os
	import os.path

	from collections import deque
	from urllib.request import urlopen, Request
	from bs4 import BeautifulSoup as BS


	# data ######################################################################

	# Base URL of the NOAA NCEI server; query() prepends it to every href.
	URL = "https://www.ncei.noaa.gov/"
	# Work queue of server-relative directory paths still to be crawled, seeded
	# with the starting directory. The main loop pops from it until it is empty
	# and also uses these same relative paths as local file-system paths.
	PATHS = deque(['data/noaa-global-surface-temperature/v6/'])


	def query(href='', decode=True):
		"""Fetch ``URL + href`` and return the response body.

		Parameters:
		    href: path appended to the module-level ``URL`` base.
		    decode: when true (the default) the body is returned as ``str``;
		        when false the raw ``bytes`` are returned untouched.

		Returns:
		    ``str`` if *decode* is true, ``bytes`` otherwise.

		Raises:
		    RuntimeError: the server answered with a status other than 200.
		    UnicodeDecodeError: the body does not match the selected encoding.
		    LookupError: the server declared a charset Python does not know.
		    Any exception raised by ``urlopen`` is propagated unchanged.
		"""
		req = Request(
			URL + href,
			headers={
				# Browser-like User-Agent; presumably the server rejects the
				# default urllib one -- TODO confirm.
				'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
			},
		)

		# The context manager closes the connection even if reading fails.
		with urlopen(req) as resp:
			# Explicit check rather than ``assert``, which is stripped under -O.
			if resp.status != 200:
				raise RuntimeError("server responded %s" % resp.status)
			raw = resp.read()
			# None when the Content-Type header is missing or has no charset
			# parameter; copes with extra parameters and quoted values too.
			charset = resp.headers.get_content_charset()

		if not decode:
			return raw

		# Without a declared charset fall back to ASCII, which is what a bare
		# "text/plain" response has always been decoded with here.
		return raw.decode(charset or 'ascii')

	# Breadth-first mirror of the remote directory tree: listings are taken from
	# the right end of PATHS and newly found sub-directories are queued on the
	# left, so every directory is visited in discovery order.
	while PATHS:
		path = PATHS.pop()
		# The starting directory (and its parents) is never created elsewhere;
		# without this the first mkdir/open below fails on a fresh run.
		os.makedirs(path, exist_ok=True)
		soup = BS(query(path), features="html.parser")
		for a in soup.find_all('a'):
			href = a.get('href')
			# Anchors with no (or an empty) href have nothing to follow; '?'
			# and '/' prefixes are query-string and absolute-path links.
			if not href or href[0] in '?/':
				continue
			# The href becomes part of a local path: refuse absolute URLs and
			# "." / ".." segments so the crawl cannot loop or leave the tree.
			if '://' in href or any(part in ('.', '..') for part in href.split('/')):
				continue
			todo = path + href
			exists = os.path.exists(todo)
			if todo.endswith('/'):
				PATHS.appendleft(todo)
				if not exists:
					print('creating', todo)
					os.makedirs(todo, exist_ok=True)
			elif not exists:
				print('downloading', todo)
				# Download before touching the disk, then publish the file
				# with a rename: an interrupted run never leaves a truncated
				# file that later runs would skip as "already downloaded".
				data = query(todo, False)
				partial = todo + '.part'
				with open(partial, 'wb') as output:
					output.write(data)
				os.replace(partial, todo)
No results found