Utilities

Utilities to index text.

Main functions

base_hash: a hash generation function for strings; generate_uuid: a UUID representation of a string; generate_random_string: a random string of the required length.

Behaviour

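base_hash and generate_uuid are pure: the same input always yields the same output. generate_random_string is intentionally non-deterministic.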

`base_hash(input_string)`

Generate human-readable hash to check changes in strings.

Parameters:

Name	Type	Description	Default
`input_string`	`str`	an input string	required

Returns:

Type	Description
`str`	a hash string

Source code in lmm/utils/hash.py

def base_hash(input_string: str) -> str:
    """
    Compute a short, printable fingerprint of a string.

    The fingerprint is the base64 text of the MD5 digest of the
    UTF-8 encoded input, with the last two characters removed (for a
    16-byte MD5 digest these are the '==' padding). It is meant for
    detecting changes in strings, not for cryptographic purposes.

    Args:
        input_string: the text to fingerprint

    Returns:
        the fingerprint, or an empty string when the input is empty
    """
    if not input_string:
        return ""

    # MD5 is chosen for speed; cryptographic strength is not required
    raw_digest = hashlib.md5(input_string.encode('utf-8')).digest()

    # base64 turns the raw digest bytes into printable text
    printable = base64.b64encode(raw_digest).decode('utf-8')
    return printable[:-2]

`generate_random_string(length=18)`

Generates a random string.

Parameters:

Name	Type	Description	Default
`length`	`int`	the length of the random string (defaults to 18 characters).	`18`

Returns:

Type	Description
`str`	a random string of the required length.

Source code in lmm/utils/hash.py

def generate_random_string(length: int = 18) -> str:
    """Build a random alphanumeric string.

    Each character is drawn independently with `secrets.choice` from
    the ASCII letters and the decimal digits.

    Args:
        length: number of characters to produce (18 by default).

    Returns:
        a random string of the required length.
    """
    import secrets
    import string

    pool = string.ascii_letters + string.digits
    picked: list[str] = []
    for _ in range(length):
        picked.append(secrets.choice(pool))
    return ''.join(picked)

`generate_uuid(text_input, namespace_uuid=uuid.NAMESPACE_URL)`

Generates a UUID Version 5 from a given text string using a specified namespace.

UUID v5 is based on SHA-1 hashing, ensuring that the same text input with the same namespace will always produce the same UUID.

Parameters:

Name	Type	Description	Default
`text_input`	`str`	The string from which to generate the UUID.	required
`namespace_uuid`	`UUID object`	The namespace UUID. Defaults to uuid.NAMESPACE_URL. You can use other predefined namespaces (e.g., uuid.NAMESPACE_DNS) or define your own.	`NAMESPACE_URL`

Returns:

Name	Type	Description
`str`	`str`	The generated UUID v5 as a hyphenated string (36 chars).

Source code in lmm/utils/hash.py

def generate_uuid(
    text_input: str, namespace_uuid: uuid.UUID = uuid.NAMESPACE_URL
) -> str:
    """
    Derive a deterministic version-5 UUID from a text string.

    The UUID is computed by `uuid.uuid5`, so the same text combined
    with the same namespace always maps to the same UUID.

    Args:
        text_input (str): The text the UUID is derived from.
        namespace_uuid (UUID object, optional): The namespace UUID.
            Defaults to uuid.NAMESPACE_URL. Other predefined
            namespaces (e.g., uuid.NAMESPACE_DNS) or a custom UUID
            may be passed instead.

    Returns:
        str: The UUID in its hyphenated string form (36 chars).
    """
    return str(uuid.uuid5(namespace_uuid, text_input))

Utilities to read from and write to disk and to print errors to the console. Unless a function documents the exceptions it raises, errors are not propagated; the function returns a null value instead.

`append_postfix_to_filename(filename, postfix)`

Appends a postfix string to the name of a file.

Parameters:

Name	Type	Description	Default
`filename`	`str`	The original name of the file (e.g., "my_document.txt").	required
`postfix`	`str`	The string to append (e.g., "_new").	required

Returns:

Name	Type	Description
`str`	`str`	The new filename with the postfix appended.

Source code in lmm/utils/ioutils.py

def append_postfix_to_filename(filename: str, postfix: str) -> str:
    """
    Insert a postfix between a file's base name and its extension.

    The name is split with `os.path.splitext`, so only the last
    extension is kept after the postfix (e.g. "a.tar.gz" with "_1"
    gives "a.tar_1.gz").

    Args:
        filename (str): The original file name (e.g.,
            "my_document.txt").
        postfix (str): The text to insert (e.g., "_new").

    Returns:
        str: The file name with the postfix placed before the
            extension.
    """
    import os

    stem, extension = os.path.splitext(filename)
    return "{}{}{}".format(stem, postfix, extension)

`check_allowed_content(input_string, allowed_list)`

Extracts strings delimited by single quotes from input_string and checks if any of them are in the allowed_list.

Parameters:

Name	Type	Description	Default
`input_string`	`str`	The string to extract quoted content from.	required
`allowed_list`	`list[str]`	List of strings to check against.	required

Returns:

Type	Description
`bool`	True if any extracted string is in allowed_list, False otherwise.

Source code in lmm/utils/ioutils.py

def check_allowed_content(
    input_string: str, allowed_list: list[str]
) -> bool:
    """
    Tell whether a string, or any single-quoted fragment inside it,
    appears in a list of allowed values.

    Args:
        input_string: The string to test and to extract quoted
            fragments from.
        allowed_list: The values that count as allowed.

    Returns:
        True if input_string itself, or any text found between a pair
        of single quotes in it, is in allowed_list; False otherwise.
    """
    import re

    # Cheapest test first: the whole string may itself be allowed
    if input_string in allowed_list:
        return True

    # Otherwise look at every fragment enclosed in single quotes
    quoted_fragments = re.findall(r"'([^']*)'", input_string)
    return any(
        fragment in allowed_list for fragment in quoted_fragments
    )

`clean_text_concat(text_segments)`

Concatenates a list of strings, merging overlapping tails/heads if the overlap constitutes at least one whole word.

The merge condition requires: 1. The tail of text A matches the head of text B. 2. The match represents a complete word boundary on both sides: - The character preceding the overlap in A must not be alphanumeric (or A starts with the overlap). - The character following the overlap in B must not be alphanumeric (or B ends with the overlap). 3. The overlap contains at least one alphanumeric character (to ensure it's "at least a word" and not just whitespace/punctuation).

Parameters:

Name	Type	Description	Default
`text_segments`	`list[str]`	A list of strings to concatenate.	required

Returns:

Type	Description
`str`	A single concatenated string with overlaps merged.

Source code in lmm/utils/ioutils.py

def clean_text_concat(text_segments: list[str]) -> str:
    """
    Join a list of strings left to right, merging each new segment
    into the text accumulated so far with `_merge_segments`.

    The documented merge rule is that an overlapping tail/head is
    collapsed when it amounts to at least one whole word:
    1. The tail of text A matches the head of text B.
    2. The match sits on a word boundary on both sides:
       - The character before the overlap in A is not alphanumeric
         (or A starts with the overlap).
       - The character after the overlap in B is not alphanumeric
         (or B ends with the overlap).
    3. The overlap contains at least one alphanumeric character (so
       it is "at least a word", not just whitespace/punctuation).

    Args:
        text_segments: The strings to concatenate, in order.

    Returns:
        The single merged string; an empty string for an empty list.
    """
    if not text_segments:
        return ""

    # Seed the accumulator with the first segment, fold in the rest
    first, *remaining = text_segments
    merged = first
    for segment in remaining:
        merged = _merge_segments(merged, segment)

    return merged

`create_interface(f, argv)`

Waits for Enter key presses and handles Ctrl-C to enable interactive execution of the function f and for debugging. The first command-line argument is the markdown file on which the module acts. An optional second command-line argument is the file to which changes are saved. A third command line argument, if True, creates a loop for interactive editing.

Source code in lmm/utils/ioutils.py

@validate_call
def create_interface(
    f: Callable[[str, str], list[Block] | None], argv: list[str]
) -> None:
    """Run the function f on a file named on the command line,
    either once or repeatedly in an interactive loop.

    argv[1] is the markdown file on which f acts; when it is missing,
    a usage message is printed and nothing else happens. argv[2], if
    given, is the file to which changes are saved (otherwise the
    source file itself is the target). argv[3], if exactly "True",
    starts an interactive loop: f is executed each time Enter is
    pressed, until Ctrl-C is pressed. The source file is checked with
    validate_file before f is called; if the check fails the function
    returns without calling f.
    """
    arg_count = len(argv)

    if arg_count <= 1:
        print("Usage: first command line arg is source file")
        print("       second command line arg is save file (opt)")
        print("       third command line 'True' enters loop")
        return

    filename = argv[1]
    target = argv[2] if arg_count > 2 else filename

    if not validate_file(filename):
        return

    interactive = arg_count > 3 and argv[3] == "True"

    if interactive:
        print(f"Press 'Enter' to execute the function on '{filename}'.")
        print("Press 'Ctrl-C' to exit.")

        try:
            input()
            while True:
                f(filename, target)
                # Block until the user presses Enter again
                input("Press 'Enter' to continue, 'Ctrl-C' to exit")
        except KeyboardInterrupt:
            print("\nCtrl-C detected. Exiting program.")
        except Exception as e:
            print("An unexpected error occurred: " + str(e))
        finally:
            print("Program gracefully terminated.")
    else:
        # One-shot mode: a single call, no prompts
        f(filename, target)

`list_files_with_extensions(folder_path, extensions)`

Lists all files in a given folder that match a set of specified extensions.

Parameters:

Name	Type	Description	Default
`folder_path`	`str \| Path`	The full path to the folder to search.	required
`extensions`	`str \| list[str]`	A single semicolon-separated string of file extensions (e.g., ".txt;.md;py") OR a standard list of strings (e.g., ['.txt', 'md']). Extensions may or may not start with a dot.	required

Returns:

Type	Description
`list[str]`	A list of full paths (as strings) for all matching files. Returns an
`list[str]`	empty list if no files are found.

Raises:

Type	Description
`FileNotFoundError`	If the specified folder_path does not exist.
`NotADirectoryError`	If the specified folder_path is not a directory.
`ValueError`	If the extensions string contains invalid characters for a filename.

Source code in lmm/utils/ioutils.py

def list_files_with_extensions(
    folder_path: str | Path, extensions: str | list[str]
) -> list[str]:
    """
    Lists all files in a given folder that match a set of specified extensions.

    Args:
        folder_path (str | Path): The full path to the folder to search.
        extensions (str | list[str]): A single semicolon-separated string of 
            file extensions (e.g., ".txt;.md;py") OR a standard list of strings 
            (e.g., ['.txt', 'md']). Extensions may or may not start with a dot.

    Returns:
        A list of full paths (as strings) for all matching files. Returns an
        empty list if no files are found.

    Raises:
        FileNotFoundError: If the specified folder_path does not exist.
        NotADirectoryError: If the specified folder_path is not a directory.
        ValueError: If the extensions string contains invalid characters for
            a filename.
    """
    # --- 1. Validate folder path ---
    p_folder = Path(folder_path)
    if not p_folder.exists():
        raise FileNotFoundError(
            f"The folder does not exist: '{folder_path}'"
        )
    if not p_folder.is_dir():
        raise NotADirectoryError(
            f"The specified path is not a directory: '{folder_path}'"
        )

    # --- 2. Process and Normalize Extensions ---
    raw_extensions: list[str] = []

    if isinstance(extensions, str):
        # Handle the semicolon-separated string input
        if not extensions:
            return []
        raw_extensions = extensions.split(';')
    elif isinstance(extensions, list):  # type: ignore (always met)
        # Handle the standard list input
        raw_extensions = extensions
    else:
        # Catch unexpected types
        raise TypeError(
            "Unreacheable code reached. Extensions supposed to be " 
            "a string (semicolon-separated) or a list of strings."
        )

    # Define invalid characters for filenames
    # This remains critical for security and robustness.
    invalid_chars = r'<>:"/\|?*' + "".join(map(chr, range(32)))

    processed_extensions: set[str] = set()
    for ext in raw_extensions:
        ext = str(ext).strip() # Ensure it's a string and strip whitespace
        if not ext:
            continue

        # Check for invalid characters
        if any(char in invalid_chars for char in ext):
            raise ValueError(
                f"Invalid character found in extension '{ext}'. Extensions cannot "
                f"contain any of the following: {invalid_chars}"
            )

        # Prepend dot if missing and store in the set
        if not ext.startswith('.'):
            processed_extensions.add('.' + ext.lower()) # Added .lower() for case-insensitivity
        else:
            processed_extensions.add(ext.lower()) # Added .lower() for case-insensitivity

    if not processed_extensions:
        return []

    # --- 3. Find matching files ---
    # Note: Using Path.suffix is case-sensitive, so we lower-case it here 
    # to match the lower-cased processed_extensions set.
    matching_files: list[str] = [
        str(file_path)
        for file_path in p_folder.iterdir()
        if file_path.is_file()
        and file_path.suffix.lower() in processed_extensions
    ]

    return matching_files

`parse_external_boolean(value)`

Sanitize externally given boolean

Source code in lmm/utils/ioutils.py

def parse_external_boolean(value: object) -> bool:
    """Convert an externally supplied value to a bool.

    Strings are compared case-insensitively: 'true', '1' and 'yes'
    give True; 'false', '0', 'no' and the empty string give False.
    Any other string, and any non-string value, is converted with
    Python's ordinary truthiness.
    """
    recognised = {
        'true': True,
        '1': True,
        'yes': True,
        'false': False,
        '0': False,
        'no': False,
        '': False,
    }
    if isinstance(value, str):
        verdict = recognised.get(value.lower())
        if verdict is not None:
            return verdict
    # Unrecognised strings and other types: plain truthiness
    return bool(value)

`process_string_quotes(input_string)`

Processes a string to ensure consistent internal quoting.

Rules: - If the string contains the character ", except for the first and last character, replace it with ' and make sure the string starts and ends with ". - If the string contains the character ', make sure the string starts and ends with ".

In short, the quote should create a string that can internally quote text with a consistent approach, starting from a string that may do so using different ways.

Parameters:

Name	Type	Description	Default
`input_string`	`str`	The string to be processed.	required

Returns:

Type	Description
`str`	The processed string with consistent quoting.

Source code in lmm/utils/ioutils.py

def process_string_quotes(input_string: str) -> str:
    """
    Normalise the quoting of a string so that quoted text inside it
    uses single quotes and the whole is wrapped in double quotes.

    Steps:
    - A matching pair of outer quotes (both " or both ') is removed.
    - Every remaining " is replaced with '.
    - If the result contains a ', it is wrapped in a pair of ";
      otherwise it is returned without any outer quotes.

    In short, a string that may quote text internally in different
    ways is turned into one that does so in a single consistent way.

    Args:
        input_string: The string to be processed.

    Returns:
        The processed string with consistent quoting.
    """
    inner = input_string

    # Drop one matching pair of outer quotes, double or single
    if (
        len(inner) >= 2
        and inner[0] == inner[-1]
        and inner[0] in ('"', "'")
    ):
        inner = inner[1:-1]

    # Inner quoting always uses single quotes
    inner = inner.replace('"', "'")

    # Wrap in double quotes only when there is something quoted inside
    return '"' + inner + '"' if "'" in inner else inner

`string_to_path_or_string(input_string)`

Takes a string as argument. If the string is one line, checks that the string codes for an existing file. If so, it returns a Path object for that file. Otherwise, it returns the string.

A string is considered one line if it contains no newlines, or if it only has a single trailing newline character.

Parameters:

Name	Type	Description	Default
`input_string`	`str`	The input string to check	required

Returns:

Type	Description
`Path \| str`	Path object if the string represents an existing file,
`Path \| str`	otherwise the original string

Source code in lmm/utils/ioutils.py

def string_to_path_or_string(input_string: str) -> Path | str:
    """
    Takes a string as argument. If the string is one line, checks
    that the string codes for an existing file. If so, it returns a
    Path object for that file. Otherwise, it returns the string.

    A string is considered one line if it contains no newlines, or if
    it only has a single trailing newline character.

    Args:
        input_string: The input string to check

    Returns:
        Path object if the string represents an existing file,
        otherwise the original string
    """
    # Check if string is a single line (allowing for trailing \n)
    stripped_string = input_string.rstrip('\n\r')
    if '\n' in stripped_string or '\r' in stripped_string:
        return input_string

    # Try to create a Path object and check if it exists as a file
    try:
        potential_path = Path(stripped_string.strip())
        if potential_path.exists() and potential_path.is_file():
            return potential_path
    except (OSError, ValueError):
        # Invalid path characters or other path-related errors
        pass

    # Return original string if not a valid existing file
    return input_string

`validate_file(source, logger=logger)`

Returns: None for failure, Path object otherwise

Source code in lmm/utils/ioutils.py

def validate_file(
    source: str | Path, logger: LoggerBase = logger
) -> Path | None:
    """Check that source names an existing, non-empty regular file.

    Every failure is reported through logger (a missing argument and
    an empty file as warnings, the other failures as errors).

    Returns: None for failure, Path object otherwise
    """
    if not source:
        logger.warning("No file given")
        return None

    try:
        candidate = Path(source)
        if not candidate.exists():
            logger.error(f"File does not exist: {source}")
        elif not candidate.is_file():
            logger.error(f"Not a file: {source}")
        elif candidate.stat().st_size == 0:
            logger.warning(f"File is empty: {source}")
        else:
            return candidate
    except Exception as e:
        logger.error(f"Error accessing file {source}: {str(e)}")

    # Every path that reaches this point has logged a failure
    return None

Centralized logging configuration for the ML Markdown project.

This module provides a standardized way to configure and use Python's logging module across the entire project. It ensures consistent log formatting, appropriate log levels, and centralized configuration.

Usage

# NOTE(review): this page gives the module source as
# lmm/utils/logging.py and documents FileConsoleLogger; confirm that
# the import path and the FileLogger name below are still current.
# A multi-line import list must be parenthesised to be valid Python.
from library.lm_logging import (
    get_logger,
    ConsoleLogger,
    FileLogger,
    ExceptionConsoleLogger,
)

# Use the abstract interface implementations
console_logger = ConsoleLogger(__name__)
file_logger = FileLogger(__name__, "app.log")
exception_logger = ExceptionConsoleLogger(__name__)

# Or use the traditional logger
logger = get_logger(__name__)

`ConsoleLogger`

Bases: LoggerBase

A console logger implementation that uses logging.Logger as a delegate. Logs messages to the console using Python's built-in logging module.

Source code in lmm/utils/logging.py

class ConsoleLogger(LoggerBase):
    """
    A console logger implementation that uses logging.Logger as a
    delegate. Logs messages to the console using Python's built-in
    logging module.
    """

    def __init__(self, name: str | None = None) -> None:
        """
        Initialize the ConsoleLogger with a specific logger name,
        typically __name__ to use the module name.

        Args:
            name: The name of the delegate logger. None or an empty
                string selects the root logger.
        """
        # logging.getLogger returns the root logger for both None and
        # "", so a missing name needs no special-casing.
        self.logger = logging.getLogger(name)
        self.logger.setLevel(logging.INFO)

        # Add a stdout handler only when neither this logger nor any
        # of its ancestors already has a handler (hasHandlers checks
        # the whole hierarchy).
        if not self.logger.hasHandlers():
            handler = logging.StreamHandler(sys.stdout)
            formatter = logging.Formatter(
                '%(levelname)s - %(message)s'
            )
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)

    def set_level(self, level: int) -> None:
        """Set the logging level for the logger."""
        self.logger.setLevel(level)

    def get_level(self) -> int:
        """Get the current logging level"""
        return self.logger.level

    def info(self, msg: str) -> None:
        """Log an informational message."""
        self.logger.info(msg)

    def error(self, msg: str) -> None:
        """Log an error message."""
        self.logger.error(msg)

    def warning(self, msg: str) -> None:
        """Log a warning message."""
        self.logger.warning(msg)

    def critical(self, msg: str) -> None:
        """Log a critical message together with the current call stack."""
        self.logger.critical(msg, stack_info=True)

`__init__(name=None)`

Initialize the ConsoleLogger with a specific logger name, typically `__name__` to use the module name.

Source code in lmm/utils/logging.py

def __init__(self, name: str | None = None) -> None:
    """
    Initialize the ConsoleLogger with a specific logger name,
    typically __name__ to use the module name
    """
    if name is not None or not bool(name):
        self.logger = logging.getLogger(name)
    else:
        self.logger = logging.getLogger()
    self.logger.setLevel(logging.INFO)

    # Ensure we have a console handler if none exists
    if not self.logger.hasHandlers():
        handler = logging.StreamHandler(sys.stdout)
        formatter = logging.Formatter(
            '%(levelname)s - %(message)s'
        )
        handler.setFormatter(formatter)
        self.logger.addHandler(handler)

`critical(msg)`

Log a critical message.

Source code in lmm/utils/logging.py

def critical(self, msg: str) -> None:
    """Emit msg at CRITICAL level on the delegate logger, with the
    current call stack attached (stack_info=True)."""
    emit = self.logger.critical
    emit(msg, stack_info=True)

`error(msg)`

Log an error message.

Source code in lmm/utils/logging.py

def error(self, msg: str) -> None:
    """Emit msg at ERROR level on the delegate logger."""
    emit = self.logger.error
    emit(msg)

`get_level()`

Get the current logging level

Source code in lmm/utils/logging.py

def get_level(self) -> int:
    """Return the level currently set on the delegate logger."""
    delegate = self.logger
    return delegate.level

`info(msg)`

Log an informational message.

Source code in lmm/utils/logging.py

def info(self, msg: str) -> None:
    """Emit msg at INFO level on the delegate logger."""
    emit = self.logger.info
    emit(msg)

`set_level(level)`

Set the logging level for the logger.

Source code in lmm/utils/logging.py

def set_level(self, level: int) -> None:
    """Apply level as the threshold of the delegate logger."""
    delegate = self.logger
    delegate.setLevel(level)

`warning(msg)`

Log a warning message.

Source code in lmm/utils/logging.py

def warning(self, msg: str) -> None:
    """Emit msg at WARNING level on the delegate logger."""
    emit = self.logger.warning
    emit(msg)

`ExceptionConsoleLogger`

Bases: LoggerBase

A console logger implementation that raises exceptions on error and critical calls.

This logger behaves like ConsoleLogger for info, warning, and set_level methods, but raises exceptions when error() or critical() methods are called. The message is still logged before the exception is raised.

Source code in lmm/utils/logging.py

class ExceptionConsoleLogger(LoggerBase):
    """
    A console logger whose error() and critical() calls raise.

    info(), warning(), set_level() and get_level() simply delegate to
    a logging.Logger. error() and critical() first log the message
    and then raise a RuntimeError carrying it.
    """

    def __init__(self, name: str = "") -> None:
        """
        Set up the delegate logger, named "<name>_exception", at INFO
        level with a stdout handler.

        Args:
            name: The name of the logger, typically __name__ to use
                the module name
        """
        delegate = logging.getLogger(f"{name}_exception")
        delegate.setLevel(logging.INFO)

        # Attach the console handler once per delegate logger
        if len(delegate.handlers) == 0:
            console = logging.StreamHandler(sys.stdout)
            console.setFormatter(
                logging.Formatter('%(levelname)s - %(message)s')
            )
            delegate.addHandler(console)

        self.logger = delegate

    def set_level(self, level: int) -> None:
        """Apply level as the threshold of the delegate logger."""
        self.logger.setLevel(level)

    def get_level(self) -> int:
        """Return the level currently set on the delegate logger."""
        return self.logger.level

    def info(self, msg: str) -> None:
        """Emit msg at INFO level."""
        self.logger.info(msg)

    def error(self, msg: str) -> None:
        """Emit msg at ERROR level, then raise RuntimeError."""
        self.logger.error(msg)
        raise RuntimeError(f"Error: {msg}")

    def warning(self, msg: str) -> None:
        """Emit msg at WARNING level."""
        self.logger.warning(msg)

    def critical(self, msg: str) -> None:
        """Emit msg at CRITICAL level, then raise RuntimeError."""
        self.logger.critical(msg)
        raise RuntimeError(f"Critical error: {msg}")

`__init__(name='')`

Initialize the ExceptionConsoleLogger with a specific logger name.

Parameters:

Name	Type	Description	Default
`name`	`str`	The name of the logger, typically `__name__` to use the module name	`''`

Source code in lmm/utils/logging.py

def __init__(self, name: str = "") -> None:
    """
    Set up the delegate logger, named "<name>_exception", at INFO
    level with a stdout handler.

    Args:
        name: The name of the logger, typically __name__ to use
            the module name
    """
    delegate = logging.getLogger(f"{name}_exception")
    delegate.setLevel(logging.INFO)

    # Attach the console handler once per delegate logger
    if len(delegate.handlers) == 0:
        console = logging.StreamHandler(sys.stdout)
        console.setFormatter(
            logging.Formatter('%(levelname)s - %(message)s')
        )
        delegate.addHandler(console)

    self.logger = delegate

`critical(msg)`

Log a critical message and raise an exception.

Source code in lmm/utils/logging.py

def critical(self, msg: str) -> None:
    """Emit msg at CRITICAL level, then raise a RuntimeError that
    carries the same message."""
    emit = self.logger.critical
    emit(msg)
    raise RuntimeError(f"Critical error: {msg}")

`error(msg)`

Log an error message and raise an exception.

Source code in lmm/utils/logging.py

def error(self, msg: str) -> None:
    """Emit msg at ERROR level, then raise a RuntimeError that
    carries the same message."""
    emit = self.logger.error
    emit(msg)
    raise RuntimeError(f"Error: {msg}")

`get_level()`

Get the current logging level

Source code in lmm/utils/logging.py

def get_level(self) -> int:
    """Return the level currently set on the delegate logger."""
    delegate = self.logger
    return delegate.level

`info(msg)`

Log an informational message.

Source code in lmm/utils/logging.py

def info(self, msg: str) -> None:
    """Emit msg at INFO level on the delegate logger."""
    emit = self.logger.info
    emit(msg)

`set_level(level)`

Set the logging level for the logger.

Source code in lmm/utils/logging.py

def set_level(self, level: int) -> None:
    """Apply level as the threshold of the delegate logger."""
    delegate = self.logger
    delegate.setLevel(level)

`warning(msg)`

Log a warning message.

Source code in lmm/utils/logging.py

def warning(self, msg: str) -> None:
    """Emit msg at WARNING level on the delegate logger."""
    emit = self.logger.warning
    emit(msg)

`FileConsoleLogger`

Bases: LoggerBase

A file logger implementation that uses logging.Logger as a delegate. Logs messages to a specified file using Python's built-in logging module, and relays the messages to the console as well.

This logger allows independent control of logging levels for both file and console outputs.

Source code in lmm/utils/logging.py

class FileConsoleLogger(LoggerBase):
    """
    A file logger implementation that uses logging.Logger as a
    delegate. Logs messages to a specified file using Python's
    built-in logging module, and relays the messages to the console
    as well.

    This logger allows independent control of logging levels for
    both file and console outputs.
    """

    # Delegate that relays every message to the console
    console_logger: LoggerBase

    def __init__(
        self,
        name: str = "",
        log_file: str | Path = "app.log",
        console_level: int = logging.INFO,
        file_level: int = logging.INFO,
    ) -> None:
        """
        Initialize the FileConsoleLogger with a specific logger name,
        file path, and separate logging levels for console and file.

        Args:
            name: The name of the logger, typically __name__ to use
                the module name
            log_file: Path to the log file where messages will be
                written
            console_level: The logging level for console output
                (default: logging.INFO)
            file_level: The logging level for file output
                (default: logging.INFO)
        """
        self.logger = logging.getLogger(f"{name}_file")
        self.logger.setLevel(file_level)

        # Detach AND close handlers left by an earlier instance with
        # the same name: this avoids duplicate output, and closing
        # releases their log files (clearing the list alone would
        # leave those files open).
        for stale_handler in list(self.logger.handlers):
            self.logger.removeHandler(stale_handler)
            stale_handler.close()

        # Add file handler
        handler = logging.FileHandler(log_file)
        formatter = logging.Formatter(
            '%(asctime)s - %(levelname)s - %(message)s'
        )
        handler.setFormatter(formatter)
        self.logger.addHandler(handler)

        # Prevent propagation to avoid duplicate logs
        self.logger.propagate = False

        # Delegate for console
        self.console_logger = ConsoleLogger(name)
        self.console_logger.set_level(console_level)

    def set_level(self, level: int) -> None:
        """
        Set the logging level for both file and console loggers.

        Args:
            level: The logging level to set for both outputs
        """
        self.logger.setLevel(level)
        self.console_logger.set_level(level)

    def set_console_level(self, level: int) -> None:
        """
        Set the logging level for the console logger only.

        Args:
            level: The logging level for console output
        """
        self.console_logger.set_level(level)

    def set_file_level(self, level: int) -> None:
        """
        Set the logging level for the file logger only.

        Args:
            level: The logging level for file output
        """
        self.logger.setLevel(level)

    def get_level(self) -> int:
        """
        Get the current logging level for the file logger.

        Returns:
            The file logger's current level
        """
        return self.logger.level

    def get_console_level(self) -> int:
        """
        Get the current logging level for the console logger.

        Returns:
            The console logger's current level
        """
        return self.console_logger.get_level()

    def get_file_level(self) -> int:
        """
        Get the current logging level for the file logger.

        Returns:
            The file logger's current level
        """
        return self.logger.level

    def info(self, msg: str) -> None:
        """Log an informational message to the file and the console."""
        self.logger.info(msg)
        self.console_logger.info(msg)

    def error(self, msg: str) -> None:
        """Log an error message to the file and the console."""
        self.logger.error(msg)
        self.console_logger.error(msg)

    def warning(self, msg: str) -> None:
        """Log a warning message to the file and the console."""
        self.logger.warning(msg)
        self.console_logger.warning(msg)

    def critical(self, msg: str) -> None:
        """Log a critical message to the file (with the current call
        stack) and relay it to the console logger."""
        self.logger.critical(msg, stack_info=True)
        self.console_logger.critical(msg)

`__init__(name='', log_file='app.log', console_level=logging.INFO, file_level=logging.INFO)`

Initialize the FileConsoleLogger with a specific logger name, file path, and separate logging levels for console and file.

Parameters:

Name	Type	Description	Default
`name`	`str`	The name of the logger, typically `__name__` to use the module name	`''`
`log_file`	`str \| Path`	Path to the log file where messages will be written	`'app.log'`
`console_level`	`int`	The logging level for console output (default: logging.INFO)	`INFO`
`file_level`	`int`	The logging level for file output (default: logging.INFO)	`INFO`

Source code in lmm/utils/logging.py

def __init__(
    self,
    name: str = "",
    log_file: str | Path = "app.log",
    console_level: int = logging.INFO,
    file_level: int = logging.INFO,
) -> None:
    """
    Initialize the FileConsoleLogger with a specific logger name,
    file path, and separate logging levels for console and file.

    Handlers already attached to the file logger of the same name
    are detached and closed before the new file handler is added.

    Args:
        name: The name of the logger, typically __name__ to use
            the module name
        log_file: Path to the log file where messages will be
            written
        console_level: The logging level for console output
            (default: logging.INFO)
        file_level: The logging level for file output
            (default: logging.INFO)
    """
    self.logger = logging.getLogger(f"{name}_file")
    self.logger.setLevel(file_level)

    # Detach and close handlers left by an earlier initialisation of
    # the same named logger: this avoids duplicate records and,
    # unlike merely clearing the list, releases their open files.
    for stale in list(self.logger.handlers):
        self.logger.removeHandler(stale)
        stale.close()

    # Add file handler
    handler = logging.FileHandler(log_file)
    formatter = logging.Formatter(
        '%(asctime)s - %(levelname)s - %(message)s'
    )
    handler.setFormatter(formatter)
    self.logger.addHandler(handler)

    # Prevent propagation to avoid duplicate logs
    self.logger.propagate = False

    # Delegate for console
    self.console_logger = ConsoleLogger(name)
    self.console_logger.set_level(console_level)

`critical(msg)`

Log a critical message.

Source code in lmm/utils/logging.py

def critical(self, msg: str) -> None:
    """Send a critical message to the file (with stack info), then
    to the console."""
    file_sink, console_sink = self.logger, self.console_logger
    file_sink.critical(msg, stack_info=True)
    console_sink.critical(msg)

`error(msg)`

Log an error message.

Source code in lmm/utils/logging.py

def error(self, msg: str) -> None:
    """Send an error message to the file, then the console."""
    for sink in (self.logger, self.console_logger):
        sink.error(msg)

`get_console_level()`

Get the current logging level for the console logger.

Returns:

Type	Description
`int`	The console logger's current level

Source code in lmm/utils/logging.py

def get_console_level(self) -> int:
    """
    Report the level currently set on the console logger.

    Returns:
        Whatever the console delegate's get_level() reports
    """
    console = self.console_logger
    return console.get_level()

`get_file_level()`

Get the current logging level for the file logger.

Returns:

Type	Description
`int`	The file logger's current level

Source code in lmm/utils/logging.py

def get_file_level(self) -> int:
    """
    Report the level currently set on the file logger.

    Returns:
        The level attribute of the underlying file logger
    """
    file_logger = self.logger
    return file_logger.level

`get_level()`

Get the current logging level for the file logger.

Returns:

Type	Description
`int`	The file logger's current level

Source code in lmm/utils/logging.py

def get_level(self) -> int:
    """
    Report the level currently set on the file logger.

    Returns:
        The level attribute of the underlying file logger
    """
    file_logger = self.logger
    return file_logger.level

`info(msg)`

Log an informational message.

Source code in lmm/utils/logging.py

def info(self, msg: str) -> None:
    """Send an informational message to the file, then the console."""
    for sink in (self.logger, self.console_logger):
        sink.info(msg)

`set_console_level(level)`

Set the logging level for the console logger only.

Parameters:

Name	Type	Description	Default
`level`	`int`	The logging level for console output	required

Source code in lmm/utils/logging.py

def set_console_level(self, level: int) -> None:
    """
    Change the level of the console logger, leaving the file logger
    untouched.

    Args:
        level: The logging level for console output
    """
    console = self.console_logger
    console.set_level(level)

`set_file_level(level)`

Set the logging level for the file logger only.

Parameters:

Name	Type	Description	Default
`level`	`int`	The logging level for file output	required

Source code in lmm/utils/logging.py

def set_file_level(self, level: int) -> None:
    """
    Change the level of the file logger, leaving the console logger
    untouched.

    Args:
        level: The logging level for file output
    """
    file_logger = self.logger
    file_logger.setLevel(level)

`set_level(level)`

Set the logging level for both file and console loggers.

Parameters:

Name	Type	Description	Default
`level`	`int`	The logging level to set for both outputs	required

Source code in lmm/utils/logging.py

def set_level(self, level: int) -> None:
    """
    Apply one logging level to the file logger and then to the
    console logger.

    Args:
        level: The logging level to set for both outputs
    """
    file_logger, console = self.logger, self.console_logger
    file_logger.setLevel(level)
    console.set_level(level)

`warning(msg)`

Log a warning message.

Source code in lmm/utils/logging.py

def warning(self, msg: str) -> None:
    """Send a warning message to the file, then the console."""
    for sink in (self.logger, self.console_logger):
        sink.warning(msg)

`FileLogger`

Bases: LoggerBase

A file logger implementation that uses logging.Logger as a delegate. Logs messages to a specified file using Python's built-in logging module.

Source code in lmm/utils/logging.py

class FileLogger(LoggerBase):
    """
    A file logger implementation that uses logging.Logger as a
    delegate. Logs messages to a specified file using Python's
    built-in logging module.
    """

    def __init__(
        self, name: str = "", log_file: str | Path = "app.log"
    ) -> None:
        """
        Initialize the FileLogger with a specific logger name and
        file path. The level starts at logging.INFO.

        Handlers already attached to the logger of the same name are
        detached and closed before the new file handler is added.

        Args:
            name: The name of the logger, typically __name__ to use
                the module name
            log_file: Path to the log file where messages will be
                written
        """
        self.logger = logging.getLogger(f"{name}_file")
        self.logger.setLevel(logging.INFO)

        # Detach and close handlers left by an earlier initialisation
        # of the same named logger: this avoids duplicate records and,
        # unlike merely clearing the list, releases their open files.
        for stale in list(self.logger.handlers):
            self.logger.removeHandler(stale)
            stale.close()

        # Add file handler
        handler = logging.FileHandler(log_file)
        formatter = logging.Formatter(
            '%(asctime)s - %(levelname)s - %(message)s'
        )
        handler.setFormatter(formatter)
        self.logger.addHandler(handler)

        # Prevent propagation to avoid duplicate logs
        self.logger.propagate = False

    def set_level(self, level: int) -> None:
        """Set the logging level for the logger."""
        self.logger.setLevel(level)

    def get_level(self) -> int:
        """Get the current logging level"""
        return self.logger.level

    def info(self, msg: str) -> None:
        """Log an informational message."""
        self.logger.info(msg)

    def error(self, msg: str) -> None:
        """Log an error message."""
        self.logger.error(msg)

    def warning(self, msg: str) -> None:
        """Log a warning message."""
        self.logger.warning(msg)

    def critical(self, msg: str) -> None:
        """Log a critical message, including stack information."""
        self.logger.critical(msg, stack_info=True)

`__init__(name='', log_file='app.log')`

Initialize the FileLogger with a specific logger name and file path.

Parameters:

Name	Type	Description	Default
`name`	`str`	The name of the logger, typically `__name__` to use the module name	`''`
`log_file`	`str \| Path`	Path to the log file where messages will be written	`'app.log'`

Source code in lmm/utils/logging.py

def __init__(
    self, name: str = "", log_file: str | Path = "app.log"
) -> None:
    """
    Initialize the FileLogger with a specific logger name and
    file path.

    Args:
        name: The name of the logger, typically __name__ to use
            the module name
        log_file: Path to the log file where messages will be
            written
    """
    self.logger = logging.getLogger(f"{name}_file")
    self.logger.setLevel(logging.INFO)

    # Clear any existing handlers to avoid duplicates
    self.logger.handlers.clear()

    # Add file handler
    handler = logging.FileHandler(log_file)
    formatter = logging.Formatter(
        '%(asctime)s - %(levelname)s - %(message)s'
    )
    handler.setFormatter(formatter)
    self.logger.addHandler(handler)

    # Prevent propagation to avoid duplicate logs
    self.logger.propagate = False

`critical(msg)`

Log a critical message.

Source code in lmm/utils/logging.py

def critical(self, msg: str) -> None:
    """Write a critical message to the log file, with stack info."""
    file_logger = self.logger
    file_logger.critical(msg, stack_info=True)

`error(msg)`

Log an error message.

Source code in lmm/utils/logging.py

def error(self, msg: str) -> None:
    """Write an error message to the log file."""
    file_logger = self.logger
    file_logger.error(msg)

`get_level()`

Get the current logging level

Source code in lmm/utils/logging.py

def get_level(self) -> int:
    """Return the level set on the underlying logger."""
    file_logger = self.logger
    return file_logger.level

`info(msg)`

Log an informational message.

Source code in lmm/utils/logging.py

def info(self, msg: str) -> None:
    """Write an informational message to the log file."""
    file_logger = self.logger
    file_logger.info(msg)

`set_level(level)`

Set the logging level for the logger.

Source code in lmm/utils/logging.py

def set_level(self, level: int) -> None:
    """Change the level of the underlying logger."""
    file_logger = self.logger
    file_logger.setLevel(level)

`warning(msg)`

Log a warning message.

Source code in lmm/utils/logging.py

def warning(self, msg: str) -> None:
    """Write a warning message to the log file."""
    file_logger = self.logger
    file_logger.warning(msg)

`LoggerBase`

Bases: ABC

Abstract interface for logging functionality.

Source code in lmm/utils/logging.py

class LoggerBase(ABC):
    """
    Abstract base class declaring the operations a concrete logger
    must provide: setting and reading the level, and one method per
    severity (info, warning, error, critical).
    """

    @abstractmethod
    def set_level(self, level: int) -> None:
        """Change the logging level of the logger."""

    @abstractmethod
    def get_level(self) -> int:
        """Report the logging level of the logger."""

    @abstractmethod
    def info(self, msg: str) -> None:
        """Record an informational message."""

    @abstractmethod
    def error(self, msg: str) -> None:
        """Record an error message."""

    @abstractmethod
    def warning(self, msg: str) -> None:
        """Record a warning message."""

    @abstractmethod
    def critical(self, msg: str) -> None:
        """Record a critical message."""

`critical(msg)` `abstractmethod`

Log a critical message.

Source code in lmm/utils/logging.py

@abstractmethod
def critical(self, msg: str) -> None:
    """Record a critical message."""

`error(msg)` `abstractmethod`

Log an error message.

Source code in lmm/utils/logging.py

@abstractmethod
def error(self, msg: str) -> None:
    """Record an error message."""

`get_level()` `abstractmethod`

Get the current logging level

Source code in lmm/utils/logging.py

@abstractmethod
def get_level(self) -> int:
    """Report the logging level of the logger."""

`info(msg)` `abstractmethod`

Log an informational message.

Source code in lmm/utils/logging.py

@abstractmethod
def info(self, msg: str) -> None:
    """Record an informational message."""

`set_level(level)` `abstractmethod`

Set the logging level for the logger.

Source code in lmm/utils/logging.py

@abstractmethod
def set_level(self, level: int) -> None:
    """Change the logging level of the logger."""

`warning(msg)` `abstractmethod`

Log a warning message.

Source code in lmm/utils/logging.py

@abstractmethod
def warning(self, msg: str) -> None:
    """Record a warning message."""

`LoglistLogger`

Bases: LoggerBase

Maintains a list of logged errors and warnings that can be inspected by the object creator.

Source code in lmm/utils/logging.py

class LoglistLogger(LoggerBase):
    """
    Maintains a list of logged errors and warnings that can be
    inspected by the object creator.
    """

    def __init__(self) -> None:
        """
        Initialize the logger.
        """
        self.logs: list[dict[str, str]] = []

    def set_level(self, level: int) -> None:
        """Set the logging level for the logger."""
        pass

    def get_level(self) -> int:
        """Get the current logging level"""
        return 0

    def info(self, msg: str) -> None:
        """Log an informational message."""
        self.logs.append({'info': msg})

    def error(self, msg: str) -> None:
        """Log an error message."""
        self.logs.append({'error': msg})

    def warning(self, msg: str) -> None:
        """Log a warning message."""
        self.logs.append({'warning': msg})

    def critical(self, msg: str) -> None:
        """Log a critical message."""
        self.logs.append({'critical': msg})

    def get_logs(self, level: int = 0) -> list[str]:
        """
        Returns a list of strings with the log messages.

        Args:
           level: a filter on the logs. Possible values:
                0 or less: returns all messages
                WARNING or less: omit info
                ERROR or less: omit warning
                CRITICAL or more: only errors and critical
        """
        logs: list[str] = []
        for entry in self.logs:
            match entry:
                case {'info': msg}:
                    if level <= logging.INFO:
                        logs.append("INFO - " + msg)
                case {'warning': msg}:
                    if level <= logging.WARNING:
                        logs.append("WARNING - " + msg)
                case {'error': msg}:
                    if level <= logging.ERROR:
                        logs.append("ERROR - " + msg)
                case {'critical': msg}:
                    logs.append("CRITICAL - " + msg)
                case _:
                    logs.append(str(entry))
        return logs

    def count_logs(self, level: int = 0) -> int:
        """The number of recorded logs. Zero means there
        were no recorded logs."""
        logs = self.get_logs(level)
        return len(logs)

    def clear_logs(self) -> None:
        """Clear the logs from the cache"""
        self.logs.clear()

    def print_logs(self, level: int = 0) -> None:
        logs: list[str] = self.get_logs(level)
        for log in logs:
            print(log)

`__init__()`

Initialize the logger.

Source code in lmm/utils/logging.py

def __init__(self) -> None:
    """
    Start with an empty list of records.
    """
    empty: list[dict[str, str]] = []
    self.logs = empty

`clear_logs()`

Clear the logs from the cache

Source code in lmm/utils/logging.py

def clear_logs(self) -> None:
    """Discard all recorded logs."""
    recorded = self.logs
    recorded.clear()

`count_logs(level=0)`

The number of recorded logs. Zero means there were no recorded logs.

Source code in lmm/utils/logging.py

def count_logs(self, level: int = 0) -> int:
    """The number of recorded logs that pass the `level` filter of
    get_logs. Zero means there were no such logs."""
    return len(self.get_logs(level))

`critical(msg)`

Log a critical message.

Source code in lmm/utils/logging.py

def critical(self, msg: str) -> None:
    """Record a critical message."""
    entry = {'critical': msg}
    self.logs.append(entry)

`error(msg)`

Log an error message.

Source code in lmm/utils/logging.py

def error(self, msg: str) -> None:
    """Record an error message."""
    entry = {'error': msg}
    self.logs.append(entry)

`get_level()`

Get the current logging level

Source code in lmm/utils/logging.py

def get_level(self) -> int:
    """Always report 0 (logging.NOTSET)."""
    return logging.NOTSET

`get_logs(level=0)`

Returns a list of strings with the log messages.

Parameters:

Name	Type	Description	Default
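`level`	`int`	a threshold on the logs. Possible values: INFO or less: returns all messages; above INFO, up to WARNING: omit info; above WARNING, up to ERROR: omit info and warning; above ERROR: only critical. Critical messages, and records of an unrecognised kind, are always returned.	`0`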

Source code in lmm/utils/logging.py

def get_logs(self, level: int = 0) -> list[str]:
    """
    Returns a list of strings with the log messages.

    Args:
       level: a filter on the logs. Possible values:
            0 or less: returns all messages
            WARNING or less: omit info
            ERROR or less: omit warning
            CRITICAL or more: only errors and critical
    """
    logs: list[str] = []
    for entry in self.logs:
        match entry:
            case {'info': msg}:
                if level <= logging.INFO:
                    logs.append("INFO - " + msg)
            case {'warning': msg}:
                if level <= logging.WARNING:
                    logs.append("WARNING - " + msg)
            case {'error': msg}:
                if level <= logging.ERROR:
                    logs.append("ERROR - " + msg)
            case {'critical': msg}:
                logs.append("CRITICAL - " + msg)
            case _:
                logs.append(str(entry))
    return logs

`info(msg)`

Log an informational message.

Source code in lmm/utils/logging.py

def info(self, msg: str) -> None:
    """Record an informational message."""
    entry = {'info': msg}
    self.logs.append(entry)

`set_level(level)`

Set the logging level for the logger.

Source code in lmm/utils/logging.py

def set_level(self, level: int) -> None:
    """Accepted for interface compatibility; the level is ignored."""
    return None

`warning(msg)`

Log a warning message.

Source code in lmm/utils/logging.py

def warning(self, msg: str) -> None:
    """Record a warning message."""
    entry = {'warning': msg}
    self.logs.append(entry)

`add_file_handler(log_file)`

Add a file handler to the root logger to write logs to a file.

Parameters:

Name	Type	Description	Default
`log_file`	`str \| Path`	Path to the log file	required

Source code in lmm/utils/logging.py

def add_file_handler(log_file: str | Path) -> None:
    """
    Add a file handler to the root logger to write logs to a file.

    The call is idempotent: if the root logger already has a file
    handler writing to the same path, nothing is added, so repeated
    calls do not cause each record to be written several times.

    Args:
        log_file: Path to the log file
    """
    import os

    root_logger = logging.getLogger()

    # FileHandler stores the absolute path in baseFilename; compare
    # on the same form before opening the file a second time.
    target = os.path.abspath(os.fspath(log_file))
    for existing in root_logger.handlers:
        if (
            isinstance(existing, logging.FileHandler)
            and existing.baseFilename == target
        ):
            return

    file_handler = logging.FileHandler(log_file)
    file_handler.setFormatter(
        logging.Formatter(LOG_FORMAT, DATE_FORMAT)
    )
    root_logger.addHandler(file_handler)

`get_logger(name)`

Get a logger with the specified name.

Parameters:

Name	Type	Description	Default
`name`	`str`	The name of the logger, typically `__name__` to use the module name	required

Returns:

Type	Description
`LoggerBase`	A configured logger instance

Source code in lmm/utils/logging.py

def get_logger(name: str) -> LoggerBase:
    """
    Create a console logger with the specified name.

    Args:
        name: The name of the logger, typically __name__ to use the
            module name

    Returns:
        A new ConsoleLogger instance built from the name
    """
    return ConsoleLogger(name)

`get_logging_logger(name)`

Get a logger with the specified name.

Parameters:

Name	Type	Description	Default
`name`	`str`	The name of the logger, typically `__name__` to use the module name	required

Returns:

Type	Description
`Logger`	A configured logger instance

Source code in lmm/utils/logging.py

def get_logging_logger(name: str) -> logging.Logger:
    """
    Get the standard-library logger with the specified name.

    The logger is obtained through logging.getLogger, so it belongs
    to the logging hierarchy: the same name always yields the same
    object, and the logger inherits the level and handlers
    configured on the root logger.

    Args:
        name: The name of the logger, typically __name__ to use the
            module name

    Returns:
        The logging.Logger registered under the name
    """
    # Loggers must not be instantiated directly: a bare
    # logging.Logger(name) has no parent and ignores the root
    # logger's configuration.
    return logging.getLogger(name)

`set_log_level(level)`

Set the log level for all loggers.

Parameters:

Name	Type	Description	Default
`level`	`int`	The logging level (e.g., logging.DEBUG, logging.INFO)	required

Source code in lmm/utils/logging.py

def set_log_level(level: int) -> None:
    """
    Set the log level of the root logger.

    Args:
        level: The logging level (e.g., logging.DEBUG, logging.INFO)
    """
    root_logger = logging.getLogger()
    root_logger.setLevel(level)

The utility class LazyLoadingDict stores memoized language model class objects, or indeed objects of any class, produced by a factory function.

The LazyLoadingDict class has three main uses that may be combined.

the first is to create objects based on a definition using a dictionary interface. The key of the dictionary is the definition that provides the object instance; different instances may be created based on the definition
the second is to memoize the objects created by the definition
the third is to enable runtime errors when an invalid definition is given.

The class is instantiated by providing the factory function in the constructor. The factory function takes one argument of the type of the dictionary key, and returns a value whose type determines the type of the values in the dictionary. To trigger runtime errors when invalid definitions are provided, use keys of StrEnum or BaseModel-derived types (for example, see the documentation of the class).

`LazyLoadingDict`

Bases: dict[KeyT, ValueT]

A lazy dictionary class with memoized object of type ValueT. To restrict the keys used, use a StrEnum key value (see example below). Any object type may be used as key, depending on how the dictionary is used.

Example:

# We define here permissible keys by inheriting from StrEnum
class LMSource(StrEnum):
    Anthropic = 'Anthropic'
    Gemini = 'Gemini'
    OpenAI = 'OpenAI'

# We then define a factory function that creates a model object
# designated by the key, i.e. a function that maps the possible
# keys to instances that are memoized. In the example, ModelClass
# objects are stored in the dictionary (code not included):
def create_model_instance(model_name: LMSource) -> ModelClass:
    print(f"Created instance of {model_name}")
    return ModelClass(model_name=model_name)

# The lazy dictionary is created by giving the factory function
# in the constructor.
lazy_dict = LazyLoadingDict(create_model_instance)

# The objects are created or retrieved as the value of the key:
openai_model = lazy_dict['OpenAI']

# If the argument of the factory is derived from StrEnum, calling
# the dictionary with an invalid key will throw a ValueError:
model = lazy_dict[LMSource('OpenX')]

This is a more elaborate example, where a whole specification is used to create objects and memoize them:

# This defines the supported model sources. Runtime errors
# provided by BaseModel below
from typing import Literal
from pydantic import BaseModel, ConfigDict

LanguageModelSource = Literal[
        'Anthropic',
        'Gemini',
        'Mistral',
        'OpenAI'
    ]

# This defines source + model
class LanguageModelSpecification(BaseModel):
    source_name: LanguageModelSource
    model_name: str

    # This is required to make instances hashable, so that they can
    # be used as keys in the dictionary
    model_config = ConfigDict(frozen=True)


# Langchain model type specified here.
def _create_model_instance(
    model: LanguageModelSpecification,
) -> BaseLM[BaseMsg]:
    # Factory function to create Langchain models while checking
    # permissible sources, provided as key values:

    match model.source_name:
        case 'OpenAI':
            from langchain_openai.chat_models import ChatOpenAI

            return ChatOpenAI(
                model=model.model_name,
                temperature=0.1,
                max_retries=2,
                use_responses_api=False,
            )
    ... (rest of code not shown)

# The memoized dictionary. langchain_models is parametrized like
# a dict[LanguageModelSpecification, BaseLM[BaseMsg]]
langchain_models = LazyLoadingDict(_create_model_instance)

# Example of use
model_spec = {'source_name': "OpenAI", 'model_name': "gpt-4o"}
model = langchain_models[
    LanguageModelSpecification(**model_spec)
]

A Pydantic model class may also be used to create a more flexible dictionary. In the previous example, only the models specified in LanguageModelSource can be specified without raising exceptions. However, a Pydantic model class may be used to constrain the objects saved in the dictionary without limiting them to a finite set, i.e. by a validation that does not constrain the instances to that set. Thus, if source_name was a str in the above example, then any LanguageModelSpecification constructed with any string will be accepted.

In the following example, the runtime error is generated in the factory function, because literals do not give rise to runtime errors in themselves.

ModelSource = Literal["OpenAI", "Cohere"]

def _model_factory(src: ModelSource) -> ModelClass:
    match src:
        case "OpenAI":
            return ModelClass("OpenAI") # code not shown
        case "Cohere":
            return ModelClass("Cohere") # code not shown
        case _:
            # required to raise error
            raise ValueError(f"Invalid model source: {src}")

model_factory = LazyLoadingDict(_model_factory)

It is also possible to assign to the dictionary directly, thus bypassing the factory function. In this case, the only checks are those that are possibly computed by Pydantic when the object is assigned.

Expected behaviour: may raise ValidationError and ValueErrors.

Source code in lmm/utils/lazy_dict.py

class LazyLoadingDict(dict[KeyT, ValueT]):
    """A lazy dictionary class with memoized object of type ValueT.
    To restrict the keys used, use a StrEnum key value (see example
    below). Any object type may be used as key, depending on how the
    dictionary is used.

    Example:
    ```python
    # We define here permissible keys by inheriting from StrEnum
    class LMSource(StrEnum):
        Anthropic = 'Anthropic'
        Gemini = 'Gemini'
        OpenAI = 'OpenAI'

    # We then define a factory function that creates a model object
    # designated by the key, i.e. a function that maps the possible
    # keys to instances that are memoized. In the example, ModelClass
    # objects are stored in the dictionary (code not included):
    def create_model_instance(model_name: LMSource) -> ModelClass:
        print(f"Created instance of {model_name}")
        return ModelClass(model_name=model_name)

    # The lazy dictionary is created by giving the factory function
    # in the constructor.
    lazy_dict = LazyLoadingDict(create_model_instance)

    # The objects are created or retrieved as the value of the key:
    openai_model = lazy_dict['OpenAI']

    # If the argument of the factory is derived from StrEnum, calling
    # the dictionary with an invalid key will throw a ValueError:
    model = lazy_dict[LMSource('OpenX')]
    ```

    This is a more elaborate example, where a whole specification is
    used to create objects and memoize them:

    ```python
    # This defines the supported model sources. Runtime errors
    # provided by BaseModel below
    from typing import Literal
    from pydantic import BaseModel, ConfigDict

    LanguageModelSource = Literal[
            'Anthropic',
            'Gemini',
            'Mistral',
            'OpenAI'
        ]

    # This defines source + model
    class LanguageModelSpecification(BaseModel):
        source_name: LanguageModelSource
        model_name: str

        # This is required to make instances hashable, so that they
        # can be used as keys in the dictionary
        model_config = ConfigDict(frozen=True)


    # Langchain model type specified here.
    def _create_model_instance(
        model: LanguageModelSpecification,
    ) -> BaseLM[BaseMsg]:
        # Factory function to create Langchain models while checking
        # permissible sources, provided as key values:

        match model.source_name:
            case 'OpenAI':
                from langchain_openai.chat_models import ChatOpenAI

                return ChatOpenAI(
                    model=model.model_name,
                    temperature=0.1,
                    max_retries=2,
                    use_responses_api=False,
                )
        ... (rest of code not shown)

    # The memoized dictionary. langchain_models is parametrized like
    # a dict[LanguageModelSpecification, BaseLM[BaseMsg]]
    langchain_models = LazyLoadingDict(_create_model_instance)

    # Example of use
    model_spec = {'source_name': "OpenAI", 'model_name': "gpt-4o"}
    model = langchain_models[
        LanguageModelSpecification(**model_spec)
    ]
    ```

    A Pydantic model class may also be used to create a more flexible
    dictionary. In the previous example, only the models specified in
    LanguageModelSource can be specified without raising exceptions.
    However, a Pydantic model class may be used to constrain the
    objects saved in the dictionary without limiting them to a finite
    set, i.e. by a validation that does not constrain the instances
    to that set. Thus, if source_name was a str in the above example,
    then any LanguageModelSpecification constructed with any string
    will be accepted.

    In the following example, the runtime error is generated in the
    factory function, because literals do not give rise to runtime
    errors in themselves.

    ```python
    ModelSource = Literal["OpenAI", "Cohere"]

    def _model_factory(src: ModelSource) -> ModelClass:
        match src:
            case "OpenAI":
                return ModelClass("OpenAI") # code not shown
            case "Cohere":
                return ModelClass("Cohere") # code not shown
            case _:
                # required to raise error
                raise ValueError(f"Invalid model source: {src}")

    model_factory = LazyLoadingDict(_model_factory)
    ```

    It is also possible to assign to the dictionary directly, thus
    bypassing the factory function. In this case, the only checks
    are those that are possibly computed by Pydantic when the object
    is assigned.

    Values removed with `del` or `clear()` are released through the
    destructor function given to the constructor, if any, or else
    through their own `close()` or `dispose()` method when they have
    one.

    Expected behaviour: may raise ValidationError and ValueErrors.
    """

    def __init__(
        self,
        key_creator_func: Callable[[KeyT], ValueT],
        destructor_func: Callable[[ValueT], None] | None = None,
    ):
        """Create an empty lazy dictionary.

        Args:
            key_creator_func: factory called with a key the first
                time that key is looked up; its result is cached
                as the value of the key.
            destructor_func: optional function called with a value
                when the value is removed from the dictionary.
        """
        super().__init__()
        self._key_creator_func = key_creator_func
        self._destructor_func = destructor_func

    def _destroy_value(self, value: ValueT) -> None:
        """Release a value that is leaving the dictionary.

        Uses the destructor given to the constructor when there is
        one; otherwise calls the value's close() or, failing that,
        its dispose() method, if callable. Values with neither are
        left alone.
        """
        # Test against None: a callable object may well be falsy
        if self._destructor_func is not None:
            self._destructor_func(value)
        elif hasattr(value, "close") and callable(value.close): # type: ignore (self-reflection)
            value.close()  # type: ignore (checked)
        elif hasattr(value, "dispose") and callable(value.dispose): # type: ignore (self-reflection)
            value.dispose() # type: ignore (checked)

    def __getitem__(self, key: KeyT) -> ValueT:
        """Return the value cached for the key, creating and caching
        it with the factory function on first access."""
        # Check if the value is already cached
        if key in self:
            return super().__getitem__(key)

        # Lazy-load the data, cache it, and return
        value: ValueT = self._key_creator_func(key)
        super().__setitem__(key, value)
        return value

    def __setitem__(self, key: KeyT, value: ValueT) -> None:
        """Allow direct setting of key/value pairs.

        This bypasses the factory function for the given key.
        Once set directly, the factory function will not be called
        for this key unless the key is deleted first.

        Raises:
            ValueError: If the key already exists in the dictionary.
        """
        if key in self:
            raise ValueError(f"Key '{key}' already exists. Delete it first to overwrite.")
        super().__setitem__(key, value)

    def __delitem__(self, key: KeyT) -> None:
        """Release the value cached for the key, then remove the
        key.

        Raises:
            KeyError: If the key is not in the dictionary.
        """
        if key in self:
            value: ValueT = super().__getitem__(key)
            self._destroy_value(value)
        super().__delitem__(key)

    def clear(self) -> None:
        """Release every cached value, then empty the dictionary."""
        # Iterate over a snapshot of the values
        for value in list(self.values()):
            self._destroy_value(value)
        super().clear()

    def __del__(self) -> None:
        """Release the cached values when the dictionary is
        finalized, ignoring any error."""
        # We need to be careful here during interpreter shutdown
        try:
            self.clear()
        except Exception:
            # Suppress errors during destruction to avoid noise
            pass

`__setitem__(key, value)`

Allow direct setting of key/value pairs.

This bypasses the factory function for the given key. Once set directly, the factory function will not be called for this key unless the key is deleted first.

Raises:

Type	Description
`ValueError`	If the key already exists in the dictionary.

Source code in lmm/utils/lazy_dict.py

def __setitem__(self, key: KeyT, value: ValueT) -> None:
    """Store ``value`` under ``key`` without consulting the factory.

    A key assigned this way is treated like any cached entry: the
    factory function is not invoked for it again unless the key is
    deleted first.

    Raises:
        ValueError: If the key already exists in the dictionary.
    """
    if key not in self:
        super().__setitem__(key, value)
        return
    # Overwriting is refused; the caller has to delete the key first.
    raise ValueError(f"Key '{key}' already exists. Delete it first to overwrite.")

`apply_markdown_heuristics(page_text)`

Applies simple heuristics to convert extracted raw text into basic Markdown format.

This function attempts to: 1. Clean up excessive whitespace. 2. Ensure proper paragraph separation (Markdown requires two newlines). 3. (Placeholder for advanced logic) Detect headings or lists based on patterns.

Source code in lmm/utils/importpdfs.py

def _starts_list_item(line: str) -> bool:
    """Return True when ``line`` begins with a recognised list marker."""
    # Markers the heuristic has always accepted (plain prefix match).
    if line.startswith(('1.', 'a.', '*', '-')):
        return True
    # Any other ordered-list number ("2.", "10.", ...) followed by
    # whitespace or end of line, so that later items of a numbered list
    # are not merged into the preceding text. Requiring the whitespace
    # keeps decimals such as "3.14" out.
    number, dot, rest = line.partition('.')
    return (
        bool(dot)
        and number.isascii()
        and number.isdigit()
        and (not rest or rest[0].isspace())
    )


def apply_markdown_heuristics(page_text: str) -> str:
    """
    Applies simple heuristics to convert extracted raw text into basic Markdown format.

    This function attempts to:
    1. Clean up excessive whitespace: every line is stripped and a run of
       blank lines is reduced to a single paragraph break.
    2. Ensure proper paragraph separation (Markdown requires two newlines):
       consecutive non-blank lines are joined with a space into one
       paragraph, and blank lines separate paragraphs.
    3. Keep lines that look like list items (``*``, ``-``, ``a.`` or a
       number followed by a dot) on a line of their own.

    Args:
        page_text: raw text extracted from a page.

    Returns:
        The reflowed text, stripped of leading and trailing whitespace.
    """
    markdown_lines: list[str] = []
    current_paragraph: list[str] = []

    def flush_paragraph() -> None:
        # Emit the pending paragraph, if any, as a single joined line.
        if current_paragraph:
            markdown_lines.append(" ".join(current_paragraph))
            current_paragraph.clear()

    # Splitting on '\n' and stripping each line also removes a trailing
    # '\r' left over from Windows line endings.
    for line in page_text.strip().split('\n'):
        stripped_line = line.strip()

        if not stripped_line:
            # A blank line ends the current paragraph. Only one empty
            # entry is kept per run of blank lines.
            flush_paragraph()
            if markdown_lines and markdown_lines[-1] != "":
                markdown_lines.append("")
        elif _starts_list_item(stripped_line):
            # List items are never merged into a paragraph.
            flush_paragraph()
            markdown_lines.append(stripped_line)
        else:
            current_paragraph.append(stripped_line)

    # Add the last pending paragraph
    flush_paragraph()

    return "\n".join(markdown_lines).strip()

`convert_folder_to_markdown(input_dir, output_dir)`

Reads all PDF files from an input directory and converts them to Markdown in an output directory.

Parameters:

Name	Type	Description	Default
`input_dir`	`str`	The path to the folder containing PDF files.	required
`output_dir`	`str`	The path where the Markdown files will be saved.	required

Source code in lmm/utils/importpdfs.py

def convert_folder_to_markdown(input_dir: str, output_dir: str):
    """
    Convert every PDF found directly inside a folder to Markdown.

    The output folder is created when missing, then each ``*.pdf`` file of
    the input folder is handed to ``convert_pdf_to_md``. Progress and
    problems are reported with ``print``; nothing is returned.

    Args:
        input_dir: The path to the folder containing PDF files.
        output_dir: The path where the Markdown files will be saved.
    """
    source_dir, target_dir = Path(input_dir), Path(output_dir)

    # Guard: without a readable input folder there is nothing to do.
    if not source_dir.is_dir():
        print(f"Error: Input directory not found at '{input_dir}'")
        return

    # Make sure the destination exists, including missing parents.
    target_dir.mkdir(parents=True, exist_ok=True)
    print(f"Output directory ensured: {target_dir}")

    # Only the top level of the input folder is searched.
    documents = [*source_dir.glob("*.pdf")]
    if len(documents) == 0:
        print(f"No PDF files found in '{input_dir}'.")
        return

    print(f"Found {len(documents)} PDF(s) to process.")
    for document in documents:
        convert_pdf_to_md(document, target_dir)

    print("\nProcessing complete.")

`convert_pdf_to_md(pdf_path, output_dir)`

Converts a single PDF file into a Markdown file.

Parameters:

Name	Type	Description	Default
`pdf_path`	`Path`	Path object to the input PDF file.	required
`output_dir`	`Path`	Path object for the output directory.	required

Source code in lmm/utils/importpdfs.py

def convert_pdf_to_md(pdf_path: Path, output_dir: Path) -> None:
    """
    Converts a single PDF file into a Markdown file.

    The text of every page is extracted with pdfplumber, passed through
    ``apply_markdown_heuristics`` and written to ``<pdf stem>.md`` inside
    ``output_dir``. After each page except the last, a ``---``-delimited
    block holding the page number and the total page count is inserted.
    Exceptions raised while reading the PDF or writing the output are
    caught and reported with ``print``.

    Args:
        pdf_path: Path object to the input PDF file.
        output_dir: Path object for the output directory.
    """
    print(f"Processing: {pdf_path.name}")

    markdown_content: list[str] = []
    output_filename = pdf_path.stem + ".md"
    output_path = output_dir / output_filename

    # Initialize default LAParams to fix the "unpack requires a buffer..." error
    # By explicitly passing this object, we prevent pdfminer.six from performing
    # an internal initialization step that fails on some PDFs.
    default_laparams = LAParams()

    # The repair step needs a Ghostscript executable. Use the known Windows
    # install location only when it is actually present; otherwise pass
    # None so that pdfplumber looks for Ghostscript on PATH instead of
    # failing on a path that does not exist on this machine.
    windows_gs = "C:/Program Files/gs/gs10.06.0/bin/gswin64c.exe"
    gs_executable = windows_gs if Path(windows_gs).is_file() else None

    try:
        # Pass the initialized default_laparams to pdfplumber.open()
        with pdfplumber.open(
            pdf_path,
            laparams=default_laparams.__dict__,
            repair=True,
            repair_setting="default",
            gs_path=gs_executable,
        ) as pdf:
            total_pages = len(pdf.pages)

            for i, page in enumerate(pdf.pages):
                # Extract text retaining layout structure (via 'layout' argument)
                raw_text = page.extract_text(
                    x_tolerance=2, y_tolerance=2, layout=True
                )

                if raw_text:
                    # Apply markdown formatting heuristics
                    formatted_text = apply_markdown_heuristics(
                        raw_text
                    )
                    markdown_content.append(formatted_text)

                # Insert a metadata block between pages to track content
                if i < total_pages - 1:
                    markdown_content.append(
                        f"\n\n---\npage: {i + 1}\n"
                        f"total_pages: {total_pages}\n---\n\n"
                    )

        # Write the final content to the Markdown file
        output_path.write_text(
            "\n".join(markdown_content), encoding="utf-8"
        )
        print(f"Successfully converted to: {output_path}")

    except Exception as e:
        # Best-effort batch conversion: report the failure and let the
        # caller carry on with the next file.
        print(
            f"ERROR: Failed to process {pdf_path.name}. Reason: {e}"
        )

Keys	Action
`?`	Open this help
`n`	Next page
`p`	Previous page
`s`	Search

Utilities

base_hash(input_string)

generate_random_string(length=18)

generate_uuid(text_input, namespace_uuid=uuid.NAMESPACE_URL)

append_postfix_to_filename(filename, postfix)

check_allowed_content(input_string, allowed_list)

clean_text_concat(text_segments)

create_interface(f, argv)

list_files_with_extensions(folder_path, extensions)

parse_external_boolean(value)

process_string_quotes(input_string)

string_to_path_or_string(input_string)

validate_file(source, logger=logger)

ConsoleLogger

__init__(name=None)

critical(msg)

error(msg)

get_level()

info(msg)

set_level(level)

warning(msg)

ExceptionConsoleLogger

__init__(name='')

critical(msg)

error(msg)

get_level()

info(msg)

set_level(level)

warning(msg)

FileConsoleLogger

__init__(name='', log_file='app.log', console_level=logging.INFO, file_level=logging.INFO)

critical(msg)

error(msg)

get_console_level()

get_file_level()

get_level()

info(msg)

set_console_level(level)

set_file_level(level)

set_level(level)

warning(msg)

FileLogger

__init__(name='', log_file='app.log')

critical(msg)

error(msg)

get_level()

info(msg)

set_level(level)

warning(msg)

LoggerBase

critical(msg) abstractmethod

error(msg) abstractmethod

get_level() abstractmethod

info(msg) abstractmethod

set_level(level) abstractmethod

warning(msg) abstractmethod

LoglistLogger

__init__()

clear_logs()

count_logs(level=0)

critical(msg)

error(msg)

get_level()

get_logs(level=0)

info(msg)

set_level(level)

warning(msg)

add_file_handler(log_file)

get_logger(name)

get_logging_logger(name)

set_log_level(level)

LazyLoadingDict

__setitem__(key, value)

apply_markdown_heuristics(page_text)

convert_folder_to_markdown(input_dir, output_dir)

convert_pdf_to_md(pdf_path, output_dir)

`base_hash(input_string)`

`generate_random_string(length=18)`

`generate_uuid(text_input, namespace_uuid=uuid.NAMESPACE_URL)`

`append_postfix_to_filename(filename, postfix)`

`check_allowed_content(input_string, allowed_list)`

`clean_text_concat(text_segments)`

`create_interface(f, argv)`

`list_files_with_extensions(folder_path, extensions)`

`parse_external_boolean(value)`

`process_string_quotes(input_string)`

`string_to_path_or_string(input_string)`

`validate_file(source, logger=logger)`

`ConsoleLogger`

`__init__(name=None)`

`critical(msg)`

`error(msg)`

`get_level()`

`info(msg)`

`set_level(level)`

`warning(msg)`

`ExceptionConsoleLogger`

`__init__(name='')`

`critical(msg)`

`error(msg)`

`get_level()`

`info(msg)`

`set_level(level)`

`warning(msg)`

`FileConsoleLogger`

`__init__(name='', log_file='app.log', console_level=logging.INFO, file_level=logging.INFO)`

`critical(msg)`

`error(msg)`

`get_console_level()`

`get_file_level()`

`get_level()`

`info(msg)`

`set_console_level(level)`

`set_file_level(level)`

`set_level(level)`

`warning(msg)`

`FileLogger`

`__init__(name='', log_file='app.log')`

`critical(msg)`

`error(msg)`

`get_level()`

`info(msg)`

`set_level(level)`

`warning(msg)`

`LoggerBase`

`critical(msg)` `abstractmethod`

`error(msg)` `abstractmethod`

`get_level()` `abstractmethod`

`info(msg)` `abstractmethod`

`set_level(level)` `abstractmethod`

`warning(msg)` `abstractmethod`

`LoglistLogger`

`__init__()`

`clear_logs()`

`count_logs(level=0)`

`critical(msg)`

`error(msg)`

`get_level()`

`get_logs(level=0)`

`info(msg)`

`set_level(level)`

`warning(msg)`

`add_file_handler(log_file)`

`get_logger(name)`

`get_logging_logger(name)`

`set_log_level(level)`

`LazyLoadingDict`

`__setitem__(key, value)`

`apply_markdown_heuristics(page_text)`

`convert_folder_to_markdown(input_dir, output_dir)`

`convert_pdf_to_md(pdf_path, output_dir)`