Source code for duck.utils.urlcrack

"""
URLCrack - A lightweight module providing a robust URL class for parsing and manipulating URLs without relying on `urllib.parse` for the parsing itself (only `urllib.parse.unquote` is used, for percent-decoding during normalization).

This module handles URLs gracefully, even those without a scheme, addressing limitations found in `urllib.parse` and similar libraries.

## Features:
- Parse and manipulate URLs effortlessly.
- Supports URLs with or without schemes.
- Easily update host, port, query, and other components.

``` {note}
This module is more reliable than `urllib` and similar packages for such input, as they often struggle to handle URLs that lack a scheme (e.g., `https`).
```

## Example Usage:

```py
from urlcrack import URL

url_obj = URL('digreatbrian.tech/some/path?query=something#resource')

# Manipulate the URL object
url_obj.host = "new_site.com"
url_obj.port = 1234  # Set port to None to remove it
    
print(url_obj.to_str()) 
# Output: new_site.com:1234/some/path?query=something#resource
```

## Author:
Brian Musakwa <digreatbrian@gmail.com>
"""

import re
import os

from functools import lru_cache
from typing import (
    Tuple,
    Union,
    Optional,
    List,
)


__author__ = "Brian Musakwa"
__email__ = "digreatbrian@gmail.com"



[docs]
class InvalidURLPathError(Exception):
    """
    Signals that a URL path is invalid or fails the expected path criteria.
    """




[docs]
class InvalidURLError(Exception):
    """
    Signals that a URL is invalid or improperly formatted.
    """




[docs]
class InvalidURLAuthorityError(Exception):
    """
    Signals that the authority (netloc) of a URL is invalid.
    """

    


[docs]
class InvalidPortError(Exception):
    """
    Signals that the port of a URL is invalid.
    """




[docs]
def joinpaths(path1: str, path2: str, *more: str) -> str:
    """
    Join URL path segments with a single forward slash between them.

    Unlike a plain `os.path.join`, a segment that starts with "/" never discards
    the segments before it: the slashes at each boundary are trimmed before
    joining, so every argument is included in the final path.

    Args:
        path1 (str): The leading segment. Its trailing slashes are removed.
        path2 (str): The segment appended to `path1`. Its leading slashes are removed.
        *more (str): Further segments, appended left to right in the same way.

    Returns:
        str: The joined path. Trailing slashes of the last segment are kept, and
            a leading segment made only of slashes contributes nothing
            (e.g. `joinpaths("/", "b")` gives `"b"`).
    """
    # URL paths always use "/", whatever the host OS. `os.path.join` would insert
    # backslashes on Windows, so the POSIX flavour is used explicitly.
    import posixpath

    finalpath = path1
    for segment in (path2, *more):
        finalpath = posixpath.join(finalpath.rstrip("/"), segment.lstrip("/"))
    return finalpath




[docs]
class URL:
    """
    Lightweight URL class for parsing and manipulating URLs.

    Unlike `urllib.parse` and similar libraries, this class also works on URLs
    that have no scheme (e.g. `some-site.com/path`) and on bare URL paths
    (e.g. `/some/path`).

    The parsed URL is stored in five string components: `scheme`, `netloc`
    (the authority, i.e. optional user info, host and optional port), `path`,
    `query` and `fragment`.
    """
    
    # Only the five URL components are stored per instance.
    __slots__ = {
        "scheme",
        "netloc",
        "path",
        "query",
        "fragment",
    }
    
    def __init__(self, url: str, normalize_url: bool = True, normalization_ignore_chars: Optional[List[str]] = None):
        self.scheme = ''
        self.netloc = ''
        self.path = ''
        self.query = ''
        self.fragment = ''
        self.parse(url, normalize_url, normalization_ignore_chars=normalization_ignore_chars)
    
    @property
    def is_absolute(self) -> bool:
        """
        Returns boolean on whether this URL is an absolute URL.
        """
        return bool(self.scheme)
        
    @property
    def user_info(self) -> Optional[str]:
        """
        Returns the user info like username@passwd in URL.
        """
        if '@' in self.netloc:
            user_info, host = self.netloc.rsplit('@', 1)
            return user_info or None
    
    @property
    def host(self) -> Optional[str]:
        """
        Returns the host (excluding port) from the URL object.
        """
        if self.netloc:
            host, port = self.split_host_and_port(self.netloc)
            if '@' in host:
                user_info, host = host.rsplit('@', 1)
            return host or None
    
    @host.setter  
    def host(self, host: str):
        """
        Sets the URL host eg (some-host.com).
        """
        if self.netloc:
            old_host, port = self.split_host_and_port(self.netloc)
            user_info = self.user_info
            if port:
                if user_info:
                    self.netloc = f'{user_info}@{host}:{port}'
                else:
                    self.netloc = f'{host}:{port}'
            else:
                if user_info:
                    self.netloc = f'{user_info}@{host}'
                else:
                    self.netloc = f'{host}'
        else:
            self.netloc = str(host)
     
    @property
    def port(self) -> Optional[int]:
        """
        Returns the port from the URL object.
        """
        if self.netloc:
            host, port = self.split_host_and_port(self.netloc)
            return port or None
    
    @port.setter  
    def port(self, port: int):
        """
        Sets the port in URL authority (netloc).
        """
        if self.netloc:
            host, old_port = self.split_host_and_port(self.netloc)
            user_info = self.user_info
            if not port:
                if user_info:
                    self.netloc = f'{user_info}@{host}'
                else:
                    self.netloc = f'{host}'
            else:
               if user_info:
                    self.netloc = f'{user_info}@{host}:{port}'
               else:
                    self.netloc = f'{host}:{port}'
        else:
            raise InvalidURLAuthorityError("Cannot set port for URL without authority (port)")
    

[docs]
    @classmethod
    def urljoin(
        cls,
        base_url: str,
        head_url: str,
        replace_authority: bool = False,
        full_path_replacement: bool = True,
        normalize_urls: bool = True,
        normalization_ignore_chars: Optional[List[str]] = None
     ) -> str:
        """
        Joins 2 URLs and return the result.
        
        Notes:
            If both URLs has schemes, The new URL will contain the base URL scheme.
        
        Args:
            base_url (str): The base URL
            head_url (str): The URL or URL path to concanetate to the base URL
            replace_netloc (bool): Whether to replace URL authority (netloc). If head url has a netloc, it will be the final netloc and this also replaces the
                final scheme if it is present in head URL. Defaults to False.
            full_path_replacement (bool): This means whether to replace the query and fragment even if they are empty in head URL. Defaults to True.
            nomalize_urls (bool): Whether to normalize urls.
            normalization_ignore_chars (Optional[List[str]]): List of characters to ignore when normalizing the url path.
                By default, all unsafe characters are stripped.
    
        Example:
            > https://duckframework.xyz/some/path + http://digreatbrian.tech/path/endpoint = https://digreatbrian.tech/some/path/endpoint
        """
        assert isinstance(base_url, str), f"Base URL must be an instance of str not {type(base_url)}"
        assert isinstance(head_url, str), f"Head URL must be an instance of str not {type(head_url)}"
        
        base_url_obj = URL(base_url, normalize_url=normalize_urls, normalization_ignore_chars=normalization_ignore_chars)
        head_url_obj = URL(head_url, normalize_url=normalize_urls, normalization_ignore_chars=normalization_ignore_chars)
        
        if head_url_obj.scheme:
            base_url_obj.scheme = head_url_obj.scheme
        
        if replace_authority or not base_url_obj.netloc:
            if head_url_obj.netloc:
                base_url_obj.netloc = head_url_obj.netloc
        
        if head_url_obj.path:
            if not base_url_obj.path or base_url_obj.path == '/':
                base_url_obj.path = head_url_obj.path
            else:
                base_url_obj.path = joinpaths(base_url_obj.path, head_url_obj.path)
            if full_path_replacement:
                base_url_obj.query = head_url_obj.query
                base_url_obj.fragment = head_url_obj.fragment
            else:
                if head_url_obj.query:
                    base_url_obj.query = head_url_obj.query
                if head_url_obj.fragment:
                    base_url_obj.fragment = head_url_obj.fragment
        return base_url_obj.to_str()

    

[docs]
    @classmethod
    def normalize_url_path(cls, url_path: str, ignore_chars: Optional[List[str]]=None):
        """
        This normalizes the URL path.
        """
        return URL.normalize_url('/' + url_path, ignore_chars)

        

[docs]
    @classmethod
    def normalize_url(cls, url: str, ignore_chars: Optional[List[str]]=None):
        """
        Normalizes a URL by removing consecutive slashes, adding a leading slash, removing trailing slashes, removing disallowed characters, e.g "<", string quotes (etc), replacing back slashes and lowercasing the scheme.
        """
        from urllib.parse import unquote
        
        # First unquote url
        try:
            url = unquote(url)
        except Exception:
            pass
        
        is_url_path = False
        ignore_chars = ignore_chars or []
        
        if not url:
            # url is None
            url = ""
        
        disallowed_chars = ("<", '"', "'", "^", ">", ";", "|", "{", "}", "`", " ")
        url = url.replace("\\", "/")
    
        # removing disallowed characters
        for i in disallowed_chars:
            if i not in ignore_chars:
                url = url.replace(i, "")
    
        # For urls in form "GET /] HTTP/1.1", or  "GET /],app-emailsubscribe,app-newsletter-widget,div.newsletter-image,div[data-newsletter-1],div[data-newsletter-2],gannett-atoms-component-newsletter-cta,hl-newsletter-cta,div HTTP/1.1"
        # The urls in form above may be provided by other browsers like 1DM
        if "," in url:
            url = url.split(",")[0].strip("]")
       
        if not url or url.startswith('/'):
            is_url_path = True
        
        url_obj = URL(url, normalize_url=False)
        normalized_path = re.sub(r"/+", "/", "/" + url_obj.path.strip("/"))
        url_obj.path = normalized_path
        url_str = url_obj.to_str()
        
        if is_url_path and not url_str.startswith('/'):
            url_str = '/' + url_str
        return url_str

    

[docs]
    def split_host_and_port(self, authority: str, convert_port_to_int: bool = True) -> Tuple[str, Union[str, int]]:
        """
        Returns the host and port from authority (netloc).
        
        Args:
            authority (str): The authority or netloc (usually in form 'some-host:port')
            convert_port_to_int (bool): Whether to automatically convert port to integer (only if port found). Defaults to True.
        
        Returns:
            Tuple: Tuple containing host and port.
        """
        try:
            scheme, netloc, leftover = self.split_scheme_and_authority(authority)
            if scheme:
                raise InvalidURLAuthorityError("URL Authority or Netloc must not contain scheme (eg. 'https://').")
        except InvalidURLError:
            raise InvalidURLAuthorityError("URL Authority or Netloc is not found, make sure authority doesn't start with 'scheme://' or forward slash ('/').")
        
        host, port = '', ''
        
        # Take account for IPV6 hosts
        if '[' and ']:' in authority:
            host, port = authority.rsplit(']:', 1)
        else:
            if ':' in authority:
                host, port = authority.rsplit(':', 1)
            else:
                host = authority
        if port and convert_port_to_int:
            try:
                port = int(port)
            except ValueError as e:
                raise InvalidPortError(f"Port obtained from authority (netloc) cannot be converted to integer: {e}")
        return host, port

        

[docs]
    def innerjoin(self, head_url: str, normalize_url: bool = True, normalization_ignore_chars: Optional[List[str]] = None) -> "URL":
        """
        Join the current URL with the provided `head_url`, and update the current URL object in-place.
    
        Args:
            head_url (str): The relative or absolute URL segment to join with the current URL.
            normalize_url (bool): Whether to normalize the url.
            normalization_ignore_chars (Optional[List[str]]): List of characters to ignore when normalizing the url path.
                By default, all unsafe characters are stripped.
            
        Behavior:
        - Performs a URL join operation similar to urllib.parse.urljoin.
        - The resulting URL replaces the current URL in this object.
        - Useful for modifying the current object without creating a new instance.
    
        Returns:
            self: The current URL object with the updated value.
        """
        new_url = URL.urljoin(
            self.to_str(),
            head_url,
            normalize_urls=normalize_url,
            normalization_ignore_chars=normalization_ignore_chars
        )
        self.parse(
            new_url,
            normalize_url=False,
        ) # Already normalized somehow
        return self

    

[docs]
    def join(self, head_url: str, normalize_url: bool = True, normalization_ignore_chars: Optional[List[str]] = None) -> "URL":
        """
        Join the current URL with the provided `head_url`, and return a new URL object.
    
        Args:
            head_url (str): The relative or absolute URL segment to join with the current URL.
            normalize_url (bool): Whether to normalize the url.
            normalization_ignore_chars (Optional[List[str]]): List of characters to ignore when normalizing the url path.
                By default, all unsafe characters are stripped.
            
        Behavior:
        - Performs a URL join operation similar to urllib.parse.urljoin.
        - Unlike `innerjoin()`, this does not modify the current object.
        - Returns a new instance with the resulting joined URL.
    
        Returns:
            URL: A new URL object with the combined URL.
        """
        new_url = URL.urljoin(
            self.to_str(),
            head_url,
            normalize_urls=normalize_url,
            normalization_ignore_chars=normalization_ignore_chars
        )
        return URL(new_url)

    

[docs]
    def split_scheme_and_authority(self, url: str) -> Tuple[str, str, str]:
        """
        Returns the scheme, authority  (netloc) and leftover (which might be the path most of the time) from a valid URL.
        
        A "://" only counts as the scheme separator when no "/", "?" or "#" comes
        before it, so a URL embedded in a path, query or fragment
        (e.g. `a.com/cb?next=http://b.com`) is not mistaken for the scheme.
        The authority ends at the first "/", "?" or "#".
        
        Args:
            url (str): The URL to split.
        
        Returns:
            Tuple: A tuple containing scheme, netloc and leftover (mostly the path).
                A non-empty leftover always starts with "/"; when the authority is directly
                followed by a query or fragment, the leftover is "/" plus that query/fragment.
        
        Raises:
            InvalidURLError: If neither a scheme nor an authority is found (e.g. the text is a URL path).
        """
        def authority_end(text: str) -> int:
            # Index of the first "/", "?" or "#" in `text`, or len(text) if there is none.
            positions = [text.find(char) for char in '/?#']
            return min((pos for pos in positions if pos != -1), default=len(text))
        
        scheme, rest = '', url
        marker = url.find('://')
        
        # The first delimiter must be the "/" of "://" itself, otherwise the "://"
        # lives inside the path, query or fragment.
        if marker != -1 and authority_end(url) == marker + 1:
            scheme, rest = url[:marker], url[marker + 3:]
        
        end = authority_end(rest)
        netloc, tail = rest[:end], rest[end:]
        
        if not (scheme or netloc):
            raise InvalidURLError("URL invalid, should startwith a scheme (e.g 'https' or just the host). This might be a path being parsed here.")
        
        # Keep the leftover rooted so it can be handled as a path even when the
        # authority was directly followed by "?" or "#".
        leftover = tail if (not tail or tail.startswith('/')) else '/' + tail
        
        if '@' in scheme:
            # scheme has been left in this format 'user:pwd@https'
            user_info, scheme = scheme.rsplit('@', 1)
            netloc = '@'.join([user_info, netloc])
        
        return scheme, netloc, leftover

        

[docs]
    def split_path_components(self, url_path: str) -> Tuple[str, str, str]:
        """
        Returns the path components from a url path.
        
        The fragment starts at the first "#" and the query at the first "?" that
        comes before it; a "?" after the "#" is part of the fragment.
        
        Args:
            url_path (str): The URL path, optionally followed by a query and/or fragment.
        
        Returns:
            Tuple: The tuple containing path, query and fragment.
        
        Raises:
            InvalidURLPathError: If the text includes a scheme or does not start with a forward slash.
        """
        # A rooted (or empty) text is a path by definition; any "://" in it belongs to
        # the path, query or fragment (e.g. "/cb?next=https://site"), so it needs no
        # scheme/authority check.
        if url_path and not url_path.startswith('/'):
            try:
                scheme, netloc, _ = self.split_scheme_and_authority(url_path)
            except InvalidURLError:
                # Confirmation that this is a valid path
                pass
            else:
                if scheme:
                    raise InvalidURLPathError("URL Path must not include a scheme.")
                if netloc:
                    raise InvalidURLPathError("URL Path must start with a forward slash '/'.")
        
        # Split off the fragment first so a "?" inside it is not taken for the query delimiter.
        before_fragment, _, fragment = url_path.partition('#')
        path, _, query = before_fragment.partition('?')
        return path, query, fragment

    

[docs]
    def parse(self, url: str, normalize_url: bool = True, normalization_ignore_chars: Optional[List[str]] = None):
        """
        Parse URL from a string.
       
        Args:
           normalize_url (bool): Whether to normalize the URL e.g:
               https://// \\google.com>}////path?q`=some_query``; => https://google.com/path?q=some_query
           normalization_ignore_chars (Optional[List[str]]): List of characters to ignore when normalizing the url path.
            By default, all unsafe characters are stripped.
            
        Expected input:
           ```
           scheme://some-site.com/path/...
           scheme://some-site/...
           some-site.com/...
           /some-path/...
           ```
        """
        query, fragment = '', ''
       
        if normalize_url:
            url = URL.normalize_url(url, ignore_chars=normalization_ignore_chars)
       
        try:
            scheme, netloc, path = self.split_scheme_and_authority(url)
        except InvalidURLError:
            # The url parsed is a URL path instead.
            scheme = ''
            netloc = ''
            path = url
           
        if path:
            path, query, fragment = self.split_path_components(path)
       
        # Set attributes
        self.scheme, self.netloc, self.path, self.query, self.fragment = (
           scheme, netloc, path, query, fragment)

    

[docs]
    def to_str(self) -> str:
        return self.build_url_string(
            scheme=self.scheme,
            netloc=self.netloc,
            path=self.path,
            query=self.query,
            fragment=self.fragment,
        )

    

[docs]
    def build_url_string(
        self,
        scheme: Optional[str] = None,
        netloc: Optional[str] = None,
        path: Optional[str] = None,
        query: Optional[str] = None,
        fragment: Optional[str] = None,
    ) -> str:
        """
        Builds a URL string from the given components.
        
        Args:
            scheme (Optional[str]): The scheme, without "://".
            netloc (Optional[str]): The authority (netloc).
            path (Optional[str]): The path.
            query (Optional[str]): The query, without the leading "?".
            fragment (Optional[str]): The fragment, without the leading "#".
        
        Returns:
            str: The URL string. The query and fragment are only included when a path is given.
                When an authority is given and no query or fragment follows, trailing slashes
                are removed (so a lone "/" path is dropped).
        """
        # Deliberately not wrapped in functools.lru_cache: on a method the cache key
        # includes `self`, so the cache would keep URL objects alive while rarely hitting.
        scheme = scheme or ""
        netloc = netloc or ""
        path = path or ""
        query = query or ""
        fragment = fragment or ""
            
        parts = []
        suffix = ""
        
        if scheme:
            parts.append(scheme + '://')
        
        if netloc:
            parts.append(netloc)
        
        if path:
            parts.append(path)
            
            # Only add query or fragment if path exists
            if query:
                suffix += '?' + query
            
            if fragment:
                suffix += '#' + fragment
        
        # Build URL        
        url = "".join(parts)
        
        if netloc:
            url = url.lstrip('/')
            if not suffix:
                # Trailing slashes are only trimmed from the scheme/authority/path part;
                # slashes that end a query or fragment value are data and must survive.
                url = url.rstrip('/')
        
        return url + suffix

        

[docs]
    def __repr__(self):
        """
        Returns a string representation of the URL.

        Returns:
            str: String representation of the URL.
        """
        return (
            f"<{self.__class__.__name__} scheme='{self.scheme}' "
            f"netloc='{self.netloc}' "
            f"path='{self.path}' "
            f"query='{self.query}' "
            f"fragment='{self.fragment}'>"
        )