# Source code for duck.utils.urlcrack

"""
URLCrack - A lightweight module providing a robust URL class for parsing and manipulating URLs without relying on the `urllib` module. 

This module handles URLs gracefully, even those without a scheme, addressing limitations found in `urllib.parse` and similar libraries.

## Features:
- Parse and manipulate URLs effortlessly.
- Supports URLs with or without schemes.
- Easily update host, port, query, and other components.

``` {note}
This module is more reliable than `urllib` and similar packages, as they often struggle to handle URLs that lack a scheme (e.g., `https://`).
```

## Example Usage:

```py
from urlcrack import URL

url_obj = URL('digreatbrian.tech/some/path?query=something#resource')

# Manipulate the URL object
url_obj.host = "new_site.com"
url_obj.port = 1234  # Set port to None to remove it
    
print(url_obj.to_str()) 
# Output: new_site.com:1234/some/path?query=something#resource
```

## Author:
Brian Musakwa <digreatbrian@gmail.com>
"""

import re
import os

from functools import lru_cache
from typing import (
    Tuple,
    Union,
    Optional,
    List,
)


# Module authorship metadata (mirrors the "Author" section of the module docstring).
__author__ = "Brian Musakwa"
__email__ = "digreatbrian@gmail.com"


class InvalidURLPathError(Exception):
    """
    Raised when the URL path is invalid or does not meet expected criteria.
    """
class InvalidURLError(Exception):
    """
    Raised when the URL is invalid or improperly formatted.
    """
class InvalidURLAuthorityError(Exception):
    """
    Raised when the authority (netloc) of the URL is invalid.
    """
class InvalidPortError(Exception):
    """
    Raised when the port of the URL is invalid.
    """
def joinpaths(path1: str, path2: str, *more: str) -> str:
    """
    Join URL path segments so that every segment ends up in the result.

    A plain ``os.path.join`` discards everything before a segment that starts
    with ``/``. Here the trailing slashes of the left side and the leading
    slashes of the right side are removed before each join, so no segment is
    ever treated as a new root.

    The join uses ``posixpath`` so the separator is always ``/``, which is
    what a URL path needs on every platform (``os.path.join`` would insert a
    backslash on Windows).

    Args:
        path1 (str): The first path segment.
        path2 (str): The segment appended to ``path1``.
        *more (str): Any further segments, appended in order.

    Returns:
        str: The joined path.
    """
    import posixpath

    finalpath = posixpath.join(path1.rstrip("/"), path2.lstrip("/"))

    for segment in more:
        # Clean both sides again so the next segment cannot reset the path.
        finalpath = posixpath.join(finalpath.rstrip("/"), segment.lstrip("/"))
    return finalpath
class URL:
    """
    Lightweight URL class for manipulating and parsing URLs.

    Unlike ``urllib.parse`` and other libraries, this class also works on
    URLs without a scheme (``some-site.com/path``) and on bare paths
    (``/some/path``).

    Attributes:
        scheme: The URL scheme without ``://`` (empty string when absent).
        netloc: The authority, i.e. ``[user-info@]host[:port]``.
        path: The URL path (empty string when absent).
        query: The query string without the leading ``?``.
        fragment: The fragment without the leading ``#``.
    """

    __slots__ = (
        "scheme",
        "netloc",
        "path",
        "query",
        "fragment",
    )

    def __init__(
        self,
        url: str,
        normalize_url: bool = True,
        normalization_ignore_chars: Optional[List[str]] = None,
    ):
        """
        Parse ``url`` into its components.

        Args:
            url (str): The URL (or URL path) to parse.
            normalize_url (bool): Whether to normalize the URL before parsing.
            normalization_ignore_chars (Optional[List[str]]): Characters that
                normalization must leave in place.
        """
        self.scheme = ''
        self.netloc = ''
        self.path = ''
        self.query = ''
        self.fragment = ''
        self.parse(url, normalize_url, normalization_ignore_chars=normalization_ignore_chars)

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _split_url_head(url: str) -> Optional[Tuple[str, str, str]]:
        """
        Split ``url`` into scheme, netloc and leftover without raising.

        Returns:
            Optional[Tuple[str, str, str]]: ``(scheme, netloc, leftover)``, or
            ``None`` when neither a scheme nor a netloc could be found (the
            input is then most likely a bare path).
        """
        scheme, netloc, leftover = '', '', ''
        sep = url.find('://')

        # '://' only introduces a scheme when nothing path-, query- or
        # fragment-like precedes it; otherwise it belongs to a later
        # component, e.g. '/redirect?next=https://site.com'.
        if sep != -1 and not any(ch in url[:sep] for ch in '/?#'):
            scheme = url[:sep]
            netloc, slash, tail = url[sep + 3:].partition('/')
            if slash:
                leftover = '/' + tail
        elif not url.startswith('/'):
            netloc, slash, tail = url.partition('/')
            if slash:
                leftover = '/' + tail

        if not (scheme or netloc):
            return None

        if '@' in scheme:
            # scheme has been left in this format 'user:pwd@https'
            user_info, scheme = scheme.rsplit('@', 1)
            netloc = '@'.join([user_info, netloc])
        return scheme, netloc, leftover

    @staticmethod
    def _compose_netloc(user_info: Optional[str], host, port) -> str:
        """
        Build a netloc string from optional user info, a host and an optional port.
        """
        netloc = f'{user_info}@{host}' if user_info else f'{host}'
        if port:
            netloc = f'{netloc}:{port}'
        return netloc

    @staticmethod
    @lru_cache(maxsize=1024)
    def _assemble_url(scheme: str, netloc: str, path: str, query: str, fragment: str) -> str:
        """
        Assemble a URL string from already-defaulted components.

        The cache is keyed on the component strings only, so it never holds a
        reference to a URL instance.
        """
        parts = []

        if scheme:
            parts.append(scheme + '://')

        if netloc:
            parts.append(netloc)

        if path:
            parts.append(path)

            # Only add query or fragment if path exists
            if query:
                parts.append('?' + query)

            if fragment:
                parts.append('#' + fragment)

        url = "".join(parts)

        if netloc:
            url = url.lstrip('/')
            if not (query or fragment):
                # Drop the trailing slash of the path only; a slash that ends
                # the query or fragment is data and must be kept.
                url = url.rstrip('/')
        return url

    # ------------------------------------------------------------------
    # Properties
    # ------------------------------------------------------------------

    @property
    def is_absolute(self) -> bool:
        """
        Returns boolean on whether this URL is an absolute URL (has a scheme).
        """
        return bool(self.scheme)

    @property
    def user_info(self) -> Optional[str]:
        """
        Returns the user info part of the authority (the text before the last
        ``@`` in the netloc), or None when there is none.
        """
        if '@' in self.netloc:
            user_info, _host = self.netloc.rsplit('@', 1)
            return user_info or None
        return None

    @property
    def host(self) -> Optional[str]:
        """
        Returns the host (excluding user info and port) from the URL object.
        """
        if not self.netloc:
            return None

        host, _port = self.split_host_and_port(self.netloc)
        if '@' in host:
            _user_info, host = host.rsplit('@', 1)
        return host or None

    @host.setter
    def host(self, host: str):
        """
        Sets the URL host eg (some-host.com), keeping user info and port.
        """
        if not self.netloc:
            self.netloc = str(host)
            return

        _old_host, port = self.split_host_and_port(self.netloc)
        self.netloc = self._compose_netloc(self.user_info, host, port)

    @property
    def port(self) -> Optional[int]:
        """
        Returns the port from the URL object, or None when no port is set.
        """
        if not self.netloc:
            return None

        _host, port = self.split_host_and_port(self.netloc)
        return port or None

    @port.setter
    def port(self, port: Optional[int]):
        """
        Sets the port in URL authority (netloc). A falsy port removes it.

        Raises:
            InvalidURLAuthorityError: If the URL has no authority to attach a port to.
        """
        if not self.netloc:
            raise InvalidURLAuthorityError("Cannot set port for URL without authority (netloc)")

        host, _old_port = self.split_host_and_port(self.netloc)

        # split_host_and_port keeps the user info attached to the host; remove
        # it here so it is not written into the netloc twice.
        host = host.rsplit('@', 1)[-1]
        self.netloc = self._compose_netloc(self.user_info, host, port)

    # ------------------------------------------------------------------
    # Class-level utilities
    # ------------------------------------------------------------------

    @classmethod
    def urljoin(
        cls,
        base_url: str,
        head_url: str,
        replace_authority: bool = False,
        full_path_replacement: bool = True,
        normalize_urls: bool = True,
        normalization_ignore_chars: Optional[List[str]] = None
    ) -> str:
        """
        Joins 2 URLs and return the result.

        Notes:
            - If the head URL has a scheme, it replaces the base URL scheme.
            - The head URL authority is only used when `replace_authority` is
              True or the base URL has no authority.
            - The head path replaces an empty or root (``/``) base path,
              otherwise it is appended to the base path.

        Args:
            base_url (str): The base URL
            head_url (str): The URL or URL path to concatenate to the base URL
            replace_authority (bool): Whether to replace the URL authority (netloc) of the
                base URL with the one from the head URL (when the head URL has one). Defaults to False.
            full_path_replacement (bool): Whether to replace the query and fragment even if they
                are empty in head URL. Defaults to True.
            normalize_urls (bool): Whether to normalize urls.
            normalization_ignore_chars (Optional[List[str]]): List of characters to ignore when normalizing the url path. By default, all
                unsafe characters are stripped.

        Returns:
            str: The joined URL.

        Example:
            > https://duckframework.xyz/some/path + /endpoint?q=1 = https://duckframework.xyz/some/path/endpoint?q=1
        """
        assert isinstance(base_url, str), f"Base URL must be an instance of str not {type(base_url)}"
        assert isinstance(head_url, str), f"Head URL must be an instance of str not {type(head_url)}"

        base_url_obj = URL(base_url, normalize_url=normalize_urls, normalization_ignore_chars=normalization_ignore_chars)
        head_url_obj = URL(head_url, normalize_url=normalize_urls, normalization_ignore_chars=normalization_ignore_chars)

        if head_url_obj.scheme:
            base_url_obj.scheme = head_url_obj.scheme

        if (replace_authority or not base_url_obj.netloc) and head_url_obj.netloc:
            base_url_obj.netloc = head_url_obj.netloc

        if head_url_obj.path:
            if not base_url_obj.path or base_url_obj.path == '/':
                base_url_obj.path = head_url_obj.path
            else:
                base_url_obj.path = joinpaths(base_url_obj.path, head_url_obj.path)

        if full_path_replacement:
            base_url_obj.query = head_url_obj.query
            base_url_obj.fragment = head_url_obj.fragment
        else:
            if head_url_obj.query:
                base_url_obj.query = head_url_obj.query

            if head_url_obj.fragment:
                base_url_obj.fragment = head_url_obj.fragment
        return base_url_obj.to_str()

    @classmethod
    def normalize_url_path(cls, url_path: str, ignore_chars: Optional[List[str]] = None):
        """
        This normalizes the URL path (a leading slash is added before normalizing).
        """
        return URL.normalize_url('/' + url_path, ignore_chars)

    @classmethod
    def normalize_url(cls, url: str, ignore_chars: Optional[List[str]] = None):
        """
        Normalizes a URL.

        The URL is percent-decoded, back slashes are replaced with forward
        slashes, disallowed characters (e.g "<", string quotes, spaces) are
        removed, anything from the first comma onwards is dropped, consecutive
        slashes in the path are collapsed, a leading slash is added to the
        path and trailing slashes are removed from it.

        Args:
            url (str): The URL or URL path to normalize. A falsy value is treated as ''.
            ignore_chars (Optional[List[str]]): Disallowed characters that must be kept.

        Returns:
            str: The normalized URL.
        """
        from urllib.parse import unquote

        # First unquote url. This is best effort: anything that cannot be
        # unquoted (e.g. None) is passed on unchanged.
        try:
            url = unquote(url)
        except Exception:
            pass

        ignore_chars = ignore_chars or []

        if not url:
            # url is None
            url = ""

        disallowed_chars = ("<", '"', "'", "^", ">", ";", "|", "{", "}", "`", " ")
        url = url.replace("\\", "/")

        # removing disallowed characters in a single pass
        removable = "".join(ch for ch in disallowed_chars if ch not in ignore_chars)
        if removable:
            url = url.translate(str.maketrans("", "", removable))

        # For urls in form "GET /] HTTP/1.1", or "GET /],app-emailsubscribe,app-newsletter-widget,div HTTP/1.1"
        # The urls in form above may be provided by other browsers like 1DM
        if "," in url:
            url = url.split(",")[0].strip("]")

        is_url_path = not url or url.startswith('/')

        url_obj = URL(url, normalize_url=False)
        url_obj.path = re.sub(r"/+", "/", "/" + url_obj.path.strip("/"))
        url_str = url_obj.to_str()

        if is_url_path and not url_str.startswith('/'):
            url_str = '/' + url_str
        return url_str

    # ------------------------------------------------------------------
    # Splitting helpers (public)
    # ------------------------------------------------------------------

    def split_host_and_port(self, authority: str, convert_port_to_int: bool = True) -> Tuple[str, Union[str, int]]:
        """
        Returns the host and port from authority (netloc).

        User info, when present, stays attached to the returned host
        (``user@host``). Bracketed IPv6 hosts (``[::1]:8080``) keep their
        brackets.

        Args:
            authority (str): The authority or netloc (usually in form 'some-host:port')
            convert_port_to_int (bool): Whether to automatically convert port to integer (only if port found). Defaults to True.

        Returns:
            Tuple: Tuple containing host and port. The port is '' when absent.

        Raises:
            InvalidURLAuthorityError: If the authority is empty, starts with '/' or contains a scheme.
            InvalidPortError: If the port cannot be converted to an integer.
        """
        head = self._split_url_head(authority)

        if head is None:
            raise InvalidURLAuthorityError("URL Authority or Netloc is not found, make sure authority doesn't start with 'scheme://' or forward slash ('/').")

        if head[0]:
            raise InvalidURLAuthorityError("URL Authority or Netloc must not contain scheme (eg. 'https://').")

        # Set the user info aside first, so that the ':' of 'user:password'
        # is never mistaken for the port separator.
        user_info, at_sign, hostport = authority.rpartition('@')
        host, port = hostport, ''

        # Take account for IPV6 hosts
        bracket_end = hostport.find(']') if hostport.startswith('[') else -1

        if bracket_end != -1:
            after_bracket = hostport[bracket_end + 1:]
            if after_bracket.startswith(':'):
                host = hostport[:bracket_end + 1]
                port = after_bracket[1:]
        elif ':' in hostport:
            host, port = hostport.rsplit(':', 1)

        host = user_info + at_sign + host

        if port and convert_port_to_int:
            try:
                port = int(port)
            except ValueError as e:
                raise InvalidPortError(f"Port obtained from authority (netloc) cannot be converted to integer: {e}") from e
        return host, port

    def innerjoin(self, head_url: str, normalize_url: bool = True, normalization_ignore_chars: Optional[List[str]] = None) -> "URL":
        """
        Join the current URL with the provided `head_url`, and update the current URL object in-place.

        Args:
            head_url (str): The relative or absolute URL segment to join with the current URL.
            normalize_url (bool): Whether to normalize the url.
            normalization_ignore_chars (Optional[List[str]]): List of characters to ignore when normalizing the url path. By default, all
                unsafe characters are stripped.

        Behavior:
            - Joins using `URL.urljoin` with its default authority/query handling.
            - The resulting URL replaces the current URL in this object.
            - Useful for modifying the current object without creating a new instance.

        Returns:
            self: The current URL object with the updated value.
        """
        new_url = URL.urljoin(
            self.to_str(),
            head_url,
            normalize_urls=normalize_url,
            normalization_ignore_chars=normalization_ignore_chars
        )
        # urljoin has already applied the requested normalization.
        self.parse(new_url, normalize_url=False)
        return self

    def join(self, head_url: str, normalize_url: bool = True, normalization_ignore_chars: Optional[List[str]] = None) -> "URL":
        """
        Join the current URL with the provided `head_url`, and return a new URL object.

        Args:
            head_url (str): The relative or absolute URL segment to join with the current URL.
            normalize_url (bool): Whether to normalize the url.
            normalization_ignore_chars (Optional[List[str]]): List of characters to ignore when normalizing the url path. By default, all
                unsafe characters are stripped.

        Behavior:
            - Joins using `URL.urljoin` with its default authority/query handling.
            - Unlike `innerjoin()`, this does not modify the current object.
            - Returns a new instance with the resulting joined URL.

        Returns:
            URL: A new URL object with the combined URL.
        """
        new_url = URL.urljoin(
            self.to_str(),
            head_url,
            normalize_urls=normalize_url,
            normalization_ignore_chars=normalization_ignore_chars
        )
        # urljoin has already applied the requested normalization; normalizing
        # again here would ignore `normalize_url=False` and the ignore list.
        return URL(new_url, normalize_url=False)

    def split_scheme_and_authority(self, url: str) -> Tuple[str, str, str]:
        """
        Returns the scheme, authority (netloc) and leftover (which might be the path most of the time)
        from a valid URL.

        A '://' is only treated as the scheme separator when no '/', '?' or
        '#' comes before it.

        Returns:
            Tuple: A tuple containing scheme, netloc and leftover (mostly the path).

        Raises:
            InvalidURLError: If neither a scheme nor a netloc is found (e.g. the input is a path).
        """
        head = self._split_url_head(url)

        if head is None:
            raise InvalidURLError("URL invalid, should startwith a scheme (e.g 'https' or just the host). This might be a path being parsed here.")
        return head

    def split_path_components(self, url_path: str) -> Tuple[str, str, str]:
        """
        Returns the path components from a url path.

        The fragment starts at the first '#', and the query starts at the
        first '?' found before the fragment.

        Returns:
            Tuple: The tuple containing path, query and fragment.

        Raises:
            InvalidURLPathError: If `url_path` contains a scheme or does not start with '/'.
        """
        head = self._split_url_head(url_path)

        if head is not None:
            # A scheme or netloc was found, so this is not a bare path.
            scheme, netloc, url_path = head

            if scheme:
                raise InvalidURLPathError("URL Path must not include a scheme.")

            if netloc:
                raise InvalidURLPathError("URL Path must start with a forward slash '/'.")

        # Split off the fragment first: a '?' after '#' belongs to the fragment.
        before_fragment, _, fragment = url_path.partition('#')
        path, _, query = before_fragment.partition('?')
        return path, query, fragment

    def parse(self, url: str, normalize_url: bool = True, normalization_ignore_chars: Optional[List[str]] = None):
        """
        Parse URL from a string and store the components on this object.

        Args:
            url (str): The URL or URL path to parse.
            normalize_url (bool): Whether to normalize the URL
                e.g: https://// \\google.com>}////path?q`=some_query``; => https://google.com/path?q=some_query
            normalization_ignore_chars (Optional[List[str]]): List of characters to ignore when normalizing the url path. By default, all
                unsafe characters are stripped.

        Expected input:
            ```
            scheme://some-site.com/path/...
            scheme://some-site/...
            some-site.com/...
            /some-path/...
            ```
        """
        query, fragment = '', ''

        if normalize_url:
            url = URL.normalize_url(url, ignore_chars=normalization_ignore_chars)

        head = self._split_url_head(url)

        if head is None:
            # The url parsed is a URL path instead.
            scheme, netloc, path = '', '', url
        else:
            scheme, netloc, path = head

        if path:
            path, query, fragment = self.split_path_components(path)

        # Set attributes
        self.scheme, self.netloc, self.path, self.query, self.fragment = (
            scheme, netloc, path, query, fragment)

    # ------------------------------------------------------------------
    # Serialization
    # ------------------------------------------------------------------

    def to_str(self) -> str:
        """
        Returns the URL object as a string.
        """
        return self.build_url_string(
            scheme=self.scheme,
            netloc=self.netloc,
            path=self.path,
            query=self.query,
            fragment=self.fragment,
        )

    def build_url_string(
        self,
        scheme: Optional[str] = None,
        netloc: Optional[str] = None,
        path: Optional[str] = None,
        query: Optional[str] = None,
        fragment: Optional[str] = None,
    ) -> str:
        """
        Builds a URL string from the given components.

        Query and fragment are only emitted when a path is present. When a
        netloc is present, the trailing slash of a URL that ends with its
        path is removed.

        Returns:
            str: The assembled URL.
        """
        return self._assemble_url(
            scheme or "",
            netloc or "",
            path or "",
            query or "",
            fragment or "",
        )

    def __repr__(self):
        """
        Returns a string representation of the URL.

        Returns:
            str: String representation of the URL.
        """
        return (
            f"<{self.__class__.__name__} scheme='{self.scheme}' "
            f"netloc='{self.netloc}' "
            f"path='{self.path}' "
            f"query='{self.query}' "
            f"fragment='{self.fragment}'>"
        )