Skip to content

Instantly share code, notes, and snippets.

@leeprevost
Created December 16, 2023 02:32
Show Gist options
  • Select an option

  • Save leeprevost/5bb6fa9e5c7c02dea929edddac8b835e to your computer and use it in GitHub Desktop.

Select an option

Save leeprevost/5bb6fa9e5c7c02dea929edddac8b835e to your computer and use it in GitHub Desktop.
CCUrl: a Python class that creates Common Crawl Index information from a URL
import tldextract
from urllib.parse import urlparse
from dataclasses import dataclass, field
from surt import surt
from validators.url import url
from validators import ValidationError
# Keyword arguments shared by every derived (non-constructor) dataclass field:
# left out of __init__ and of comparisons, shown in repr, defaulting to "".
dkw = {
    "compare": False,
    "init": False,
    "default": "",
    "repr": True,
}
@dataclass(order=True)
class CCUrl:
    """Common Crawl index columns derived from a single URL.

    Column definitions from: https://skeptric.com/common-crawl-index-athena/

    Only ``url`` is accepted by the constructor; every other field is filled
    in by ``__post_init__``. When the URL fails validation, the derived fields
    keep their defaults and ``url_error`` holds a description of the failure.

    Instances compare and order by ``url_surtkey`` alone (every other field is
    declared with ``compare=False``).

    Raises:
        NotImplementedError: for a valid URL whose scheme is neither ``http``
            nor ``https`` and that carries no explicit port.
    """

    # Canonical form of URL with host name reversed. Stays None when the URL is
    # invalid, so ordering such an instance against a valid one raises TypeError.
    url_surtkey: str = field(compare=True, init=False, default=None)
    url_error: str = field(**dkw)  # Validation failure text; "" when the URL is valid
    url: str = field(init=True, compare=False, repr=True)  # URL that was archived
    url_host_name: str = field(**dkw)  # The host name
    url_host_tld: str = field(**dkw)  # TLD (e.g. au)
    # url_host_2nd_last_part … url_host_5th_last_part:
    # the parts of the host name separated by "."
    url_host_2nd_last_part: str = field(**dkw)
    url_host_3rd_last_part: str = field(**dkw)
    url_host_4th_last_part: str = field(**dkw)
    url_host_5th_last_part: str = field(**dkw)
    url_host_registry_suffix: str = field(**dkw)  # e.g. .com.au
    url_host_registered_domain: str = field(**dkw)
    url_host_private_suffix: str = field(**dkw)
    url_host_private_domain: str = field(**dkw)
    url_protocol: str = field(**dkw)  # e.g. https
    # The port accessed; it seems to be blank for default ports
    # (80 for http, 443 for https). Holds an int when set, "" otherwise.
    url_port: int = field(**dkw)
    # The path of the URL (everything from the first / to the query parameter starting at ?)
    url_path: str = field(**dkw)
    url_query: str = field(**dkw)  # Query parameter; everything after the ?

    def __post_init__(self):
        """Validate ``self.url`` and either populate the fields or record the error."""
        valid = url(self.url)
        if valid is True:
            self._initialize()
        elif isinstance(valid, ValidationError):
            # Store on the declared ``url_error`` field (as text, matching its
            # ``str`` annotation) so the failure shows up in repr().
            self.url_error = str(valid)

    def _initialize(self):
        """Populate every derived field from ``self.url`` (already validated)."""
        self.url_surtkey = surt(self.url)
        self._tld_extract = tldextract.extract(self.url, include_psl_private_domains=True)
        self._urlparse = urlparse(self.url)
        extracted = self._tld_extract
        parsed = self._urlparse

        # Split the hostname, not netloc: netloc also carries any userinfo and
        # port, which would end up inside the TLD (e.g. "com:8080").
        host_labels = (parsed.hostname or "").split(".")
        host_labels.reverse()
        self.url_host_tld = host_labels.pop(0)

        # Explicit, ordered field names; zip stops at the shorter sequence, so
        # labels beyond the 5th-last are ignored and missing ones stay "".
        part_names = (
            "url_host_2nd_last_part",
            "url_host_3rd_last_part",
            "url_host_4th_last_part",
            "url_host_5th_last_part",
        )
        for label, name in zip(host_labels, part_names):
            setattr(self, name, label)

        self.url_protocol = parsed.scheme
        self.url_host_name = ".".join(
            part
            for part in (extracted.subdomain, extracted.domain, extracted.suffix)
            if part
        )

        # CCIndex seems to leave ports null for the standard schemes, so a
        # missing port on http/https is stored as "". Any other scheme without
        # an explicit port is rejected.
        port = parsed.port
        if not port:
            if self.url_protocol not in ("http", "https"):
                raise NotImplementedError(f"Got a scheme I don't understand: {self.url_protocol}")
            port = ""
        self.url_port = port

        self.url_path = parsed.path
        self.url_query = parsed.query

        # Drop empty parts so a host without a recognised suffix does not get
        # a trailing "." (same approach as url_host_name above).
        domain = ".".join(part for part in (extracted.domain, extracted.suffix) if part)
        if extracted.is_private:
            self.url_host_private_domain = domain
            self.url_host_private_suffix = extracted.suffix
        else:
            self.url_host_registered_domain = domain
            self.url_host_registry_suffix = extracted.suffix
if __name__ == "__main__":
    # Demo: parse a handful of URLs, print them in input order, then print
    # them again sorted (CCUrl orders by its SURT key).
    sample_urls = [
        "https://commoncrawl.org/get-started",
        "https://commoncrawl.org/overview",
        "http://commoncrawl.com/overview",
        "https://www.realestate.com.au/advice/",
        "https://www.tech.com/search?query=database+tools&star_rating=4&order=alphabetical",
    ]
    parsed_urls = [CCUrl(address) for address in sample_urls]

    for entry in parsed_urls:
        print(entry)

    print("---------------------------------------")

    for entry in sorted(parsed_urls):
        print(entry)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment