Source code for magpie.datamodel

from __future__ import annotations

import uuid
from datetime import datetime

import msgspec
from msgspec import Struct, StructMeta, field

# NOTE: most of the utility methods are defined outside of this file to keep the
#       data model definitions as simple as possible.
#       However, to have auto-completion working we still need to keep empty stubs
#       for these methods.


# NOTE(review): the alias also admits `tuple[int, str]`, but the description below
# only covers the `str` and `int` forms -- confirm what a tuple segment means
# against the traversal implementation (which lives outside this file).
PathSegment = str | int | tuple[int, str]
"""Objects deriving from our `Base` struct also implement path traversal using one of
the following ways to go through an object:

- fragment is a `str` and starts with a leading dot '.' => we return the attribute of the object
  which name is the fragment with the leading dot removed
- fragment is an `int` => we return `obj[frag]`
- fragment is a `str` => we return `obj[frag]`
"""

# Plain `list` alias; this is the type accepted by `Base.follow()` and returned
# by `Folder.find_path()`.
Path = list[PathSegment]
"""A `Path` consists of a list of `PathSegment`."""



[docs]
class FormatSpec(msgspec.Struct, forbid_unknown_fields=True):
    """Set of string tokens describing one output flavour for `Base._format()`.

    `Base` defines three instances of it (`TEXT_FORMAT`, `ANSI_FORMAT` and
    `HTML_FORMAT`) which are passed to `_format()` by the `as_*` methods.
    """
    space: str  # token used for a space, e.g. ' ' or '&nbsp;'
    newline: str  # token used for a line break, e.g. '\n' or '<br>'
    start_bold: str  # token opening a bold span ('' for unstyled text)
    end_bold: str  # token closing a bold span ('' for unstyled text)




[docs]
class TaggedStructMeta(StructMeta):
    """Metaclass that automatically will tag the class being instantiated with the name
    of the module it is defined in.

    This will be useful for the `Fetcher` plugins so we don't have to specify a tag
    manually, which they need to be able to be embedded into the tagged union type.

    This will only be applied to classes defined in submodule of `magpie.fetchers`,
    i.e. whose module path is exactly ``magpie.fetchers.<name>``; the tag is then
    ``<name>``. Classes defined anywhere else are created with the struct
    configuration they were given, untouched.
    """

    def __new__(mcls, name, bases, namespace, **struct_config):
        """Create the struct class, injecting ``tag=<module name>`` for fetcher plugins.

        Args:
            name: name of the class being created
            bases: base classes of the class being created
            namespace: class body namespace
            **struct_config: struct options given as class keyword arguments;
                a ``tag`` passed here is overwritten for fetcher plugin classes
        """
        # `__module__` is put in the namespace by the `class` statement, but it is
        # missing when the class is built by calling the metaclass directly; such
        # classes are simply never auto-tagged instead of failing with a KeyError.
        module = (namespace.get('__module__') or '').split('.')

        # only set the tag for classes in submodules of `magpie.fetchers`
        if len(module) == 3 and module[:2] == ['magpie', 'fetchers']:
            struct_config['tag'] = module[2]

        return super().__new__(mcls, name, bases, namespace, **struct_config)




[docs]
class Base(
    msgspec.Struct,
    metaclass=TaggedStructMeta,
    omit_defaults=True,
    forbid_unknown_fields=True,
):
    """A base class holding some common settings.

    - We set ``omit_defaults = True`` to omit any fields containing only their
      default value from the output when encoding.
    - We set ``forbid_unknown_fields = True`` to error nicely if an unknown
      field is present in the serialized data. This helps catch typo errors
      early.

    The `_format()` and `follow()` methods are empty stubs kept here for
    auto-completion only; their implementation is provided outside of this
    module.
    """

    # Formatting presets handed to `_format()` by the `as_*` methods below.
    # They carry no annotation, so they are plain class attributes and not
    # struct fields.
    TEXT_FORMAT = FormatSpec(space=' ', newline='\n', start_bold='', end_bold='')
    ANSI_FORMAT = FormatSpec(space=' ', newline='\n', start_bold='\033[1m', end_bold='\033[22m')
    HTML_FORMAT = FormatSpec(space='&nbsp;', newline='<br>', start_bold='<b>', end_bold='</b>')

    # stub: the implementation is defined outside of this file
    def _format(self, fmt: FormatSpec, level: int = 0) -> str: ...

    def as_plain_text(self) -> str:
        """Return an unstyled text representation suitable for outputting to a file."""
        return self._format(Base.TEXT_FORMAT)

    def as_text(self) -> str:
        """Return a text representation suitable for printing in a terminal."""
        return self._format(Base.ANSI_FORMAT)

    def as_html(self) -> str:
        """Return an HTML representation."""
        return self._format(Base.HTML_FORMAT)

    def to_dict(self) -> dict:
        """Return the object as dictionary of builtin types."""
        return msgspec.to_builtins(self)

    def follow(self, path: Path | PathSegment) -> Base:
        """Follow the given path or path segment starting from `self` and return the resulting object."""
        # stub: the implementation is defined outside of this file
        ...



# having kw_only=True allows us to inherit this class and still have positional args
# see: https://jcristharif.com/msgspec/structs.html#field-ordering

[docs]
class WithID(Base, kw_only=True):
    """Use this base class for defining models that need to have a `uuid` field."""
    # `default_factory` is called for each new instance that is not given an
    # explicit `uuid`, so every such object gets its own freshly generated UUID4.
    uuid: uuid.UUID = field(default_factory=uuid.uuid4)



################################################################################
##                                                                            ##
##   Types of information                                                     ##
##                                                                            ##
################################################################################


# Import-time placeholder only: see the description below for when and why
# this name gets rebound.
UrlInformation = None
"""This class represents the information that can be extracted from the
URL alone.

It is initially defined as `None` but will be created dynamically when loading
the fetcher plugins to be the union of all of the fetchers `Info` classes. It
allows us to fully import the `datamodel` module and make it available to plugins.

This is fine as annotations are evaluated lazily and will only be required to
be correct when validating structs, and not when importing modules.
"""



[docs]
class ContentInformationBase(Base, tag='contentbase'):
    """This class represents the additional information that can be fetched
    for this URL, by downloading and/or parsing its contents.

    It contains at least the `data` field, which is the content pointed
    at by this URL (html code most of the time, but could be binary,
    e.g.: for PDFs, images, etc.)
    """
    # The field has no default value, so it must always be given when
    # constructing an instance (`None` being an accepted value).
    data: str | None  # | bytes  # FIXME: `str | bytes` not supported by msgspec



# Import-time alias of the base class: see the description below for when and
# why this name gets rebound.
ContentInformation = ContentInformationBase
"""This class represents the additional information that can be fetched
for this URL, by downloading and/or parsing its contents.

It is initially defined as `ContentInformationBase` but will be created dynamically
when loading the fetcher plugins to be the union of all of the fetchers `Content` classes.
It allows us to fully import the `datamodel` module and make it available to plugins.

This is fine as annotations are evaluated lazily and will only be required to
be correct when validating structs, and not when importing modules.
"""



[docs]
class SemanticModel(Base):
    """Description of a model used to compute semantic information.

    It holds the model identifier, the prompt template (if any) and the
    backend settings (if any). None of the fields has a default value, so all
    three must be given explicitly.
    """
    # supposes an external enum defines the models - FIXME: `enum` instead of `str`
    model: str
    # template string for LLM models, None for embedding models
    prompt: str | None
    # should be compatible with the supported backend (ATM llama_cpp.Llama for LLMs,
    # sentence_transformers.SentenceTransformer for embedding)
    settings: dict | None




[docs]
class SemanticBaseInformation(Struct):
    """Common base of the semantic results: records the `SemanticModel` attached to them.

    NOTE(review): this derives directly from msgspec's `Struct` and not from
    `Base`, so the `omit_defaults` / `forbid_unknown_fields` settings and the
    helper methods of `Base` do not apply here -- confirm this is intended.
    """
    # presumably the model that produced the `content` of the subclass -- TODO confirm
    model: SemanticModel




[docs]
class SemanticTags(SemanticBaseInformation):
    """Semantic result whose `content` is a list of tags, given as strings."""
    content: list[str]




[docs]
class SemanticSummary(SemanticBaseInformation):
    """Semantic result whose `content` is a summary, given as a single string."""
    content: str




[docs]
class SemanticEmbedding(SemanticBaseInformation):
    """Semantic result whose `content` is an embedding, stored as a list of floats."""
    content: list[float]  # better: some type of vector[float], numpy.array, torch.Tensor




[docs]
class SemanticInformation(Struct):
    """Group of the semantic results available for a URL: tags, summary and embedding.

    Each of them may be `None`, but none of the fields has a default value, so
    all three must be given explicitly when constructing an instance.

    NOTE(review): this derives directly from msgspec's `Struct` and not from
    `Base` -- confirm this is intended.
    """
    tags: SemanticTags | None
    summary: SemanticSummary | None
    embedding: SemanticEmbedding | None



################################################################################
##                                                                            ##
##   URL and Twig models                                                      ##
##                                                                            ##
################################################################################


[docs]
class Url(WithID):
    """A URL, together with whatever extra information has been attached to it.

    Only `value` is required; `url_type` and the `info`, `content` and
    `semantic` parts all default to `None`.
    """
    value: str  # TODO: validate it's a proper url
    url_type: str | None = None

    info: UrlInformation | None = None
    content: ContentInformation | None = None
    semantic: SemanticInformation | None = None

    def __str__(self) -> str:
        """Return the bare URL, or a `Url(...)` form showing its type once `info` is set."""
        # NOTE(review): the test is on `info` while the text shows `url_type`;
        # presumably both are always set together -- confirm.
        if self.info is None:
            return self.value
        return f"Url('{self.value}', type={self.url_type})"

    def __repr__(self) -> str:
        """Return the same text as `str()`."""
        return str(self)




[docs]
class TimestampMixin(Base, kw_only=True):
    """Mixin adding keyword-only `created_at` and `updated_at` timestamp fields.

    Both default to the time of instantiation, as a timezone-aware `datetime`
    expressed in the local timezone. Each field has its own default factory,
    so the two values are computed by separate calls.
    """
    # FIXME: when importing bookmarks from other format, we would probably like to keep the added created_at timestamp
    created_at: datetime = field(default_factory=lambda: datetime.now().astimezone())
    updated_at: datetime = field(default_factory=lambda: datetime.now().astimezone())



# class Twig(TimestampMixin, tag=True):

[docs]
class Twig(WithID, tag=True):
    """The `Twig` class is the main data item in MagPie.

    It can be thought of as an augmented bookmark.
    Instead of pointing to a single url, it stands more generally for an item
    of interest, usually a webpage but possibly more, that we want to remember.

    For example, a web page we want to bookmark could be associated with others
    that are semantically related, such as HackerNews/Reddit discussions,
    GitHub readme page for a software project, etc.
    """

    title: str  # also the name used to look the twig up by `str` in a `Folder`
    url: Url  # the main URL of this item
    # NOTE(review): relies on msgspec giving each instance its own copy of an
    # empty-list default (same for `tags` below) -- confirm against msgspec docs.
    related: list[Url] = []

    rating: int = 0  # 1-5 stars  (presumably 0 means "not rated" -- TODO confirm)
    tags: list[str] = []
    notes: str | None = None  # user notes/comments




[docs]
class Folder(WithID, tag=True):
    """A Folder has a name and a list of sub-folders and `Twig` contained inside it.

    It is iterable and indexable, both by item position and object name, ie:
    ```
    folder = Folder(name='root')  # add some more elements into it...
    folder.add(Twig(title='magpie website', url=Url('https://magpie.digitalgaia.net')))
    first = folder[0]  # this works
    magpie = folder['magpie website']  # this works too
    ```

    The `find*()` and `iter_*()` methods are empty stubs kept here for
    auto-completion only; their implementation is provided outside of this
    module.
    """

    name: str
    items: list[Folder | Twig] = []

    def __len__(self) -> int:
        """Return the number of items directly contained in this folder."""
        return len(self.items)

    def __getitem__(self, index: int | str) -> Folder | Twig:
        """Return a direct child, either by position or by name/title.

        When `index` is a `str`, the first item whose name (for a `Folder`) or
        title (for a `Twig`) equals it is returned.

        Raises:
            IndexError: if no item has the given name/title, or if the
                        position is out of range
        """
        if isinstance(index, str):
            for obj in self.items:
                if match_name(obj, index):
                    return obj
            raise IndexError(f'Could not find Folder/Twig with the name/title: "{index}"')
        return self.items[index]

    def __iter__(self):
        """Iterate over the items directly contained in this folder."""
        return iter(self.items)

    def add(self, item: Folder | Twig):
        """Append the given item at the end of this folder."""
        self.items.append(item)

    def remove(self, index: int | str):
        """Remove a direct child, either by position or by name/title.

        When `index` is a `str`, only the first item whose name/title equals
        it is removed.

        Raises:
            IndexError: if `index` is a position that is out of range
            ValueError: if `index` is a name and no item matches it
        """
        if isinstance(index, int):
            del self.items[index]
            return
        for i, item in enumerate(self.items):
            if match_name(item, index):
                # returning right away makes it safe to delete while iterating
                del self.items[i]
                return
        raise ValueError(f'Could not delete folder/twig with name: {index}')

    @staticmethod
    def from_urls(urls: list[str | Url]) -> Folder:
        """Build a new `Folder` named 'root' with one `Twig` per given URL.

        Each element can be given either as a `str` or as an already built
        `Url` object, which is then used as is.
        """
        twigs = []
        for url in urls:
            if isinstance(url, str):
                url = Url(url)
            twigs.append(Twig(title=f'Web page at {url.value}', url=url))

        result = Folder(name='root')
        result.items = twigs
        return result

    def find(self, uuid: uuid.UUID) -> WithID:
        """Return the object reachable from this folder with the given UUID.

        Raises:
            ValueError: if no such object could be found
        """
        ...

    def find_path(self,
                  obj: WithID | None = None,
                  uuid: uuid.UUID | None = None,
                  as_str: bool = False) -> Path:
        """Return a path to the given object, or the object identified by its UUID.
        You need to specify either `obj` or `uuid` but not both.

        Args:
            as_str: if true, return the PathSegments as str (name/title) when
                    applicable, or as integers otherwise

        Raises:
            ValueError: if both or none of `obj` and `uuid` is specified
            ValueError: if no such object could be found
        """
        ...

    # stubs: the implementations are defined outside of this file
    def iter_twigs(self: Folder, *, depth_first=True): ...
    def iter_urls(self: Folder, *, depth_first=True): ...
    def iter_tree(self: Folder, *, depth_first=True, depth=0): ...
    def iter_with_id(self: Folder, path=None): ...




[docs]
def match_name(obj: Twig | Folder, name: str) -> bool:
    """Return whether `obj` is designated by `name`.

    A `Folder` is compared on its `name`, a `Twig` on its `title`; any other
    kind of object never matches.
    """
    if isinstance(obj, Folder):
        return obj.name == name
    if isinstance(obj, Twig):
        return obj.title == name
    return False



################################################################################
##                                                                            ##
##   DataFetcher                                                              ##
##                                                                            ##
################################################################################


[docs]
class DataFetcher:
    """The DataFetcher class is the base interface for all plugins that want
    to fetch specific content.

    They need to at least implement the `match()` method to declare whether they
    can handle a certain URL.

    All the other methods have a default implementation that does nothing and
    returns `None`.
    """

    @classmethod
    def name(cls) -> str:
        """Return an identifying name for this DataFetcher. It is extracted as
        the last part of the module name in which this class is defined."""
        return cls.__module__.split('.')[-1]

    def match(self, url: Url) -> bool:
        """Return whether this DataFetcher is able to extract more
        information from the given URL.

        Raises:
            NotImplementedError: always, in this base class; plugins have to
                                 override this method
        """
        raise NotImplementedError

    def extract_info(self, url: Url) -> UrlInformation:
        """Extract semantic information purely from the URL, without
        having to download or analyze any more resources.
        This is typically done using regex matching and should be cheap
        to compute.
        You can assume that `self.match(url) is True` if this method is called.

        Examples:
            - a GitHub data fetcher would return (org_name, repo_name),
            - a HackerNews data fetcher would return the discussion ID.
        """
        # default implementation: nothing extracted, returns None
        pass

    def fetch_additional_info(self, url: Url) -> ContentInformation:
        """Extract more information from the given URL, usually by
        downloading and parsing some additional content or related
        pages.

        Examples:
            - a GitHub data fetcher would download the Readme file
              of a project
            - a HackerNews fetcher would download comments from a
              discussion and store them as a tree of comments, which
              are tuples of (author, date, comment)
        """
        # default implementation: nothing fetched, returns None
        pass

    def extract_semantic_info(self, url: Url) -> SemanticInformation:
        """Use all the information we have about this URL (extracted
        using the other methods) and feed that to an LLM to extract
        semantic information.

        This includes: summary, tags, embedding, etc.
        """
        # default implementation: nothing extracted, returns None
        pass

    def expand_data(self, folder: Folder) -> None:
        """Try to expand the database by passing the whole folder to
        each registered fetcher and asking them if they can do it.

        This allows the fetchers to not only find more information for
        a single URL, but to manipulate the whole database in order to
        reorganize it in case they need to.

        It is a good idea to call `DataRetriever.identify(folder)` and
        `DataRetriever.fetch(folder)` after this in order to get content for
        additional twigs that might have been created.
        """
        # default implementation: the folder is left untouched
        pass