leeprevost · December 16, 2023 02:32
diff --git a/ccurl.py b/ccurl.py
 import tldextract
 from urllib.parse import urlparse
 from dataclasses import dataclass, field
 from surt import surt
 from validators.url import url
 from validators import ValidationError

 # Keyword arguments shared by every derived field: left out of ``__init__``
 # and of comparisons, shown in the repr, and defaulting to an empty string.
 dkw = {
     "compare": False,
     "init": False,
     "default": "",
     "repr": True,
 }


 @dataclass(order=True)
 class CCUrl:
     """
     A URL split into the columns used by the Common Crawl index.

     Column definitions from: https://skeptric.com/common-crawl-index-athena/

     Only ``url`` is accepted by the constructor; every other field is derived
     from it in ``__post_init__``.  When ``validators.url`` rejects the URL the
     derived fields keep their defaults and the failure is stored in
     ``url_error``.

     Comparison and ordering use ``url_surtkey`` alone.  It stays ``None`` for
     a URL that failed validation, so such an instance cannot be ordered
     against one whose key is a string.
     """
     url_surtkey: str = field(compare=True, init=False, default=None)  # Canonical form of URL with host name reversed
     url_error: str = field(**dkw)  # Text of the validation failure; empty when the URL is valid
     url: str = field(init=True, compare=False, repr=True)  # URL that was archived
     url_host_name: str = field(**dkw)  # The host name
     url_host_tld: str = field(**dkw)  # TLD (e.g. au)
     url_host_2nd_last_part: str = field(**dkw)  # 2nd ... 5th last part: the parts of the host name separated by "."
     url_host_3rd_last_part: str = field(**dkw)
     url_host_4th_last_part: str = field(**dkw)
     url_host_5th_last_part: str = field(**dkw)
     url_host_registry_suffix: str = field(**dkw)  # e.g. .com.au
     url_host_registered_domain: str = field(**dkw)
     url_host_private_suffix: str = field(**dkw)
     url_host_private_domain: str = field(**dkw)
     url_protocol: str = field(**dkw)  # e.g. https
     # The port accessed; stored as "" when the URL names none (it seems to be
     # blank in the index for default ports: 80 for http, 443 for https).
     url_port: int = field(**dkw)
     # The path of the URL (everything from the first / to the query parameter starting at ?)
     url_path: str = field(**dkw)
     url_query: str = field(**dkw)  # Query parameter; everything after the ?

     def __post_init__(self):
         """Validate ``url``; populate the derived fields or record the failure."""
         valid = url(self.url)
         if valid is True:
             self._initialize()
         else:
             # Store the failure in the declared field so it appears in the repr.
             self.url_error = str(valid)
             # Undeclared attribute kept for code that already reads ``.error``.
             self.error = valid

     def _initialize(self):
         """
         Fill in every derived field from ``self.url``.

         Raises:
             NotImplementedError: if the URL names no port and its scheme is
                 neither ``http`` nor ``https``.
         """
         self.url_surtkey = surt(self.url)
         self._tld_extract = tldextract.extract(self.url, include_psl_private_domains=True)
         self._urlparse = urlparse(self.url)

         # Split the host only: ``netloc`` would also carry any ``user:pass@``
         # prefix and ``:port`` suffix, which would leak into the labels.
         host = self._urlparse.hostname or ""
         tld, *remaining_labels = reversed(host.split("."))
         self.url_host_tld = tld

         # Listed explicitly, in fill order, rather than discovered via dir().
         part_names = (
             "url_host_2nd_last_part",
             "url_host_3rd_last_part",
             "url_host_4th_last_part",
             "url_host_5th_last_part",
         )
         # zip() stops at the shorter input, so labels deeper than the 5th
         # from last are ignored and missing ones keep their "" default.
         for name, label in zip(part_names, remaining_labels):
             setattr(self, name, label)

         self.url_protocol = self._urlparse.scheme

         extracted = self._tld_extract
         self.url_host_name = ".".join(
             part for part in (extracted.subdomain, extracted.domain, extracted.suffix) if part
         )

         self.url_port = self._urlparse.port
         if not self.url_port:
             if self.url_protocol not in ("http", "https"):
                 raise NotImplementedError(f"Got a scheme I don't understand: {self.url_protocol}")
             # CCIndex seems to leave ports blank for the standard schemes.
             self.url_port = ""

         self.url_path = self._urlparse.path
         self.url_query = self._urlparse.query

         # NOTE(review): when tldextract reports an empty suffix (e.g. an IP
         # host) this join leaves a trailing "." -- confirm the wanted value.
         domain_with_suffix = ".".join((extracted.domain, extracted.suffix))
         if extracted.is_private:
             self.url_host_private_domain = domain_with_suffix
             self.url_host_private_suffix = extracted.suffix
         else:
             self.url_host_registered_domain = domain_with_suffix
             self.url_host_registry_suffix = extracted.suffix


 if __name__ == "__main__":
     # Demo: parse a few URLs, print them as given, then in SURT-key order.
     urls = (
         "https://commoncrawl.org/get-started",
         "https://commoncrawl.org/overview",
         "http://commoncrawl.com/overview",
         "https://www.realestate.com.au/advice/",
         "https://www.tech.com/search?query=database+tools&star_rating=4&order=alphabetical"
     )

     ccurls = [CCUrl(address) for address in urls]

     for entry in ccurls:
         print(entry)

     print("---------------------------------------")

     # Ordering is defined by the dataclass (order=True).
     for entry in sorted(ccurls):
         print(entry)
	import tldextract
	from urllib.parse import urlparse
	from dataclasses import dataclass, field
	from surt import surt
	from validators.url import url
	from validators import ValidationError

	# Keyword arguments shared by every derived field: left out of ``__init__``
	# and of comparisons, shown in the repr, and defaulting to an empty string.
	dkw = {
	    "compare": False,
	    "init": False,
	    "default": "",
	    "repr": True,
	}


	@dataclass(order=True)
	class CCUrl:
	    """
	    A URL split into the columns used by the Common Crawl index.

	    Column definitions from: https://skeptric.com/common-crawl-index-athena/

	    Only ``url`` is accepted by the constructor; every other field is derived
	    from it in ``__post_init__``.  When ``validators.url`` rejects the URL the
	    derived fields keep their defaults and the failure is stored in
	    ``url_error``.

	    Comparison and ordering use ``url_surtkey`` alone.  It stays ``None`` for
	    a URL that failed validation, so such an instance cannot be ordered
	    against one whose key is a string.
	    """
	    url_surtkey: str = field(compare=True, init=False, default=None)  # Canonical form of URL with host name reversed
	    url_error: str = field(**dkw)  # Text of the validation failure; empty when the URL is valid
	    url: str = field(init=True, compare=False, repr=True)  # URL that was archived
	    url_host_name: str = field(**dkw)  # The host name
	    url_host_tld: str = field(**dkw)  # TLD (e.g. au)
	    url_host_2nd_last_part: str = field(**dkw)  # 2nd ... 5th last part: the parts of the host name separated by "."
	    url_host_3rd_last_part: str = field(**dkw)
	    url_host_4th_last_part: str = field(**dkw)
	    url_host_5th_last_part: str = field(**dkw)
	    url_host_registry_suffix: str = field(**dkw)  # e.g. .com.au
	    url_host_registered_domain: str = field(**dkw)
	    url_host_private_suffix: str = field(**dkw)
	    url_host_private_domain: str = field(**dkw)
	    url_protocol: str = field(**dkw)  # e.g. https
	    # The port accessed; stored as "" when the URL names none (it seems to be
	    # blank in the index for default ports: 80 for http, 443 for https).
	    url_port: int = field(**dkw)
	    # The path of the URL (everything from the first / to the query parameter starting at ?)
	    url_path: str = field(**dkw)
	    url_query: str = field(**dkw)  # Query parameter; everything after the ?

	    def __post_init__(self):
	        """Validate ``url``; populate the derived fields or record the failure."""
	        valid = url(self.url)
	        if valid is True:
	            self._initialize()
	        else:
	            # Store the failure in the declared field so it appears in the repr.
	            self.url_error = str(valid)
	            # Undeclared attribute kept for code that already reads ``.error``.
	            self.error = valid

	    def _initialize(self):
	        """
	        Fill in every derived field from ``self.url``.

	        Raises:
	            NotImplementedError: if the URL names no port and its scheme is
	                neither ``http`` nor ``https``.
	        """
	        self.url_surtkey = surt(self.url)
	        self._tld_extract = tldextract.extract(self.url, include_psl_private_domains=True)
	        self._urlparse = urlparse(self.url)

	        # Split the host only: ``netloc`` would also carry any ``user:pass@``
	        # prefix and ``:port`` suffix, which would leak into the labels.
	        host = self._urlparse.hostname or ""
	        tld, *remaining_labels = reversed(host.split("."))
	        self.url_host_tld = tld

	        # Listed explicitly, in fill order, rather than discovered via dir().
	        part_names = (
	            "url_host_2nd_last_part",
	            "url_host_3rd_last_part",
	            "url_host_4th_last_part",
	            "url_host_5th_last_part",
	        )
	        # zip() stops at the shorter input, so labels deeper than the 5th
	        # from last are ignored and missing ones keep their "" default.
	        for name, label in zip(part_names, remaining_labels):
	            setattr(self, name, label)

	        self.url_protocol = self._urlparse.scheme

	        extracted = self._tld_extract
	        self.url_host_name = ".".join(
	            part for part in (extracted.subdomain, extracted.domain, extracted.suffix) if part
	        )

	        self.url_port = self._urlparse.port
	        if not self.url_port:
	            if self.url_protocol not in ("http", "https"):
	                raise NotImplementedError(f"Got a scheme I don't understand: {self.url_protocol}")
	            # CCIndex seems to leave ports blank for the standard schemes.
	            self.url_port = ""

	        self.url_path = self._urlparse.path
	        self.url_query = self._urlparse.query

	        # NOTE(review): when tldextract reports an empty suffix (e.g. an IP
	        # host) this join leaves a trailing "." -- confirm the wanted value.
	        domain_with_suffix = ".".join((extracted.domain, extracted.suffix))
	        if extracted.is_private:
	            self.url_host_private_domain = domain_with_suffix
	            self.url_host_private_suffix = extracted.suffix
	        else:
	            self.url_host_registered_domain = domain_with_suffix
	            self.url_host_registry_suffix = extracted.suffix


	if __name__ == "__main__":
	    # Demo: parse a few URLs, print them as given, then in SURT-key order.
	    urls = (
	        "https://commoncrawl.org/get-started",
	        "https://commoncrawl.org/overview",
	        "http://commoncrawl.com/overview",
	        "https://www.realestate.com.au/advice/",
	        "https://www.tech.com/search?query=database+tools&star_rating=4&order=alphabetical"
	    )

	    ccurls = [CCUrl(address) for address in urls]

	    for entry in ccurls:
	        print(entry)

	    print("---------------------------------------")

	    # Ordering is defined by the dataclass (order=True).
	    for entry in sorted(ccurls):
	        print(entry)
No results found