Source code for magpie.datamodel

from __future__ import annotations

import uuid
from datetime import datetime

import msgspec
from msgspec import Struct, StructMeta, field

# NOTE: most of the utility methods are defined outside of this file to keep the
#       data model definitions as simple as possible.
#       However, to have auto-completion working we still need to keep empty stubs
#       for these methods.


# NOTE(review): the union below also allows `tuple[int, str]`, but the
#               description that follows only covers the `str` and `int`
#               cases -- confirm what the tuple form is meant to express.
PathSegment = str | int | tuple[int, str]
"""Objects deriving from our `Base` struct also implement path traversal using one of
the following ways to go through an object:

- fragment is a `str` and starts with a leading dot '.' => we return the attribute of the object
  which name is the fragment with the leading dot removed
- fragment is an `int` => we return `obj[frag]`
- fragment is a `str` => we return `obj[frag]`
"""

# A full path is simply an ordered sequence of the segments defined above.
Path = list[PathSegment]
"""A `Path` consists of a list of `PathSegment`."""


class FormatSpec(msgspec.Struct, forbid_unknown_fields=True):
    """Markup strings describing one output format.

    Instances of this struct are what `Base._format` receives as its `fmt`
    argument; `Base` defines one instance each for plain text, ANSI terminals
    and HTML.
    """

    # string emitted for a space (e.g. ' ' or '&nbsp;')
    space: str
    # string emitted for a line break (e.g. '\n' or '<br>')
    newline: str
    # markup that switches bold text on (may be empty)
    start_bold: str
    # markup that switches bold text off again (may be empty)
    end_bold: str
class TaggedStructMeta(StructMeta):
    """Metaclass that tags fetcher structs with the name of their module.

    A class defined directly in a submodule of `magpie.fetchers` (that is, in
    `magpie.fetchers.<name>`) gets `tag='<name>'` added to its struct
    configuration. This spares the `Fetcher` plugins from specifying manually
    the tag they need in order to be embedded into the tagged union type.

    Classes defined anywhere else are created with their configuration
    untouched.
    """

    def __new__(mcls, name, bases, namespace, **struct_config):
        module_parts = namespace['__module__'].split('.')

        # only classes living exactly one level below `magpie.fetchers` qualify
        in_fetcher_submodule = (
            module_parts[:2] == ['magpie', 'fetchers'] and len(module_parts) == 3
        )
        if in_fetcher_submodule:
            # the last component of the module path doubles as the tag
            struct_config['tag'] = module_parts[-1]

        return super().__new__(mcls, name, bases, namespace, **struct_config)
class Base(
    msgspec.Struct,
    metaclass=TaggedStructMeta,
    omit_defaults=True,
    forbid_unknown_fields=True,
):
    """Root struct of the data model, carrying the settings shared by all models.

    - ``omit_defaults = True``: fields that only hold their default value are
      left out of the encoded output.
    - ``forbid_unknown_fields = True``: decoding fails with a clear error when
      the serialized data contains a field we do not know about, which helps
      catch typos early.
    """

    # One `FormatSpec` per supported output flavour.
    TEXT_FORMAT = FormatSpec(space=' ', newline='\n', start_bold='', end_bold='')
    ANSI_FORMAT = FormatSpec(space=' ', newline='\n', start_bold='\033[1m', end_bold='\033[22m')
    HTML_FORMAT = FormatSpec(space='&nbsp;', newline='<br>', start_bold='<b>', end_bold='</b>')

    # Empty stub kept for auto-completion; see the note at the top of the file
    # about utility methods being defined outside of this module.
    def _format(self, fmt: FormatSpec, level: int = 0) -> str: ...

    def as_plain_text(self) -> str:
        """Format this object without any styling, e.g. for writing to a file."""
        plain = Base.TEXT_FORMAT
        return self._format(plain)

    def as_text(self) -> str:
        """Format this object for display in a terminal (ANSI bold sequences)."""
        ansi = Base.ANSI_FORMAT
        return self._format(ansi)

    def as_html(self) -> str:
        """Format this object as HTML."""
        html = Base.HTML_FORMAT
        return self._format(html)

    def to_dict(self):
        """Convert this object to a dictionary made of builtin types only."""
        builtins_view = msgspec.to_builtins(self)
        return builtins_view

    def follow(self, path: Path | PathSegment) -> Base:
        """Walk along `path` (a full path or a single segment) starting at `self`
        and return the object found at the end."""
        ...
# having kw_only=True allows us to inherit this class and still have positional args # see: https://jcristharif.com/msgspec/structs.html#field-ordering
class WithID(Base, kw_only=True):
    """Use this base class for defining models that need to have a `uuid` field.

    The field is keyword-only and, when not given, is filled in by calling
    `uuid.uuid4()` for each new instance.
    """

    # The default is evaluated while the class body runs, i.e. before the class
    # attribute named `uuid` exists, so `uuid.uuid4` still refers to the module.
    uuid: uuid.UUID = field(default_factory=uuid.uuid4)
################################################################################ ## ## ## Types of information ## ## ## ################################################################################ UrlInformation = None """This class represents the information that can be extracted from the URL alone. It is initially defined as `None` but will be created dynamically when loading the fetcher plugins to be the union of all of the fetchers `Info` classes. It allows us to fully import the `datamodel` module and make it available to plugins. This is fine as annotations are evaluated lazily and will only be required to be correct when validating structs, and not when importing modules. """
class ContentInformationBase(Base, tag='contentbase'):
    """This class represents the additional information that can be fetched for this URL,
    by downloading and/or parsing its contents.

    It contains at least the `data` field, which is the content pointed at by this URL
    (html code most of the time, but could be binary, e.g.: for PDFs, images, etc.)
    """

    # Required field (no default); `None` is an accepted value.
    # FIXME: `str | bytes` not supported by msgspec
    data: str | None  # | bytes
# Placeholder alias: per the description below, it is meant to be replaced
# when the fetcher plugins are loaded.
ContentInformation = ContentInformationBase
"""This class represents the additional information that can be fetched for this URL,
by downloading and/or parsing its contents.

It is initially defined as `ContentInformationBase` but will be created dynamically
when loading the fetcher plugins to be the union of all of the fetchers `Content` classes.
It allows us to fully import the `datamodel` module and make it available to plugins.

This is fine as annotations are evaluated lazily and will only be required to be
correct when validating structs, and not when importing modules.
"""
class SemanticModel(Base):
    """Description of a model: its identifier, its prompt and its backend settings.

    All three fields are required (none has a default); `prompt` and `settings`
    accept `None`.
    """

    # supposes an external enum defines the models - FIXME: `enum` instead of `str`
    model: str

    # template string for LLM models, None for embedding models
    prompt: str | None

    # should be compatible with the supported backend (ATM llama_cpp.Llama for LLMs,
    # sentence_transformers.SentenceTransformer for embedding)
    settings: dict | None
# NOTE(review): this derives from the plain msgspec `Struct` and not from `Base`,
#               so the `omit_defaults` / `forbid_unknown_fields` settings of `Base`
#               do not apply here -- confirm this is intentional.
class SemanticBaseInformation(Struct):
    """Common parent of the semantic results; it only holds a `SemanticModel`."""

    # presumably the model that produced the `content` of the subclasses -- TODO confirm
    model: SemanticModel
class SemanticTags(SemanticBaseInformation):
    """Semantic result whose content is a list of tags."""

    # the tags, as plain strings
    content: list[str]
class SemanticSummary(SemanticBaseInformation):
    """Semantic result whose content is a single summary string."""

    # the summary text
    content: str
class SemanticEmbedding(SemanticBaseInformation):
    """Semantic result whose content is an embedding stored as a list of floats."""

    # better: some type of vector[float], numpy.array, torch.Tensor
    content: list[float]
# NOTE(review): like `SemanticBaseInformation`, this derives from the plain
#               msgspec `Struct` rather than from `Base` -- confirm this is intentional.
class SemanticInformation(Struct):
    """Groups the semantic results available for a URL: tags, summary and embedding.

    The three fields are required (none has a default) but each accepts `None`.
    """

    tags: SemanticTags | None
    summary: SemanticSummary | None
    embedding: SemanticEmbedding | None
################################################################################ ## ## ## URL and Twig models ## ## ## ################################################################################
class Url(WithID):
    """A URL together with the optional information gathered about it.

    Only `value` is required; `url_type`, `info`, `content` and `semantic`
    all default to `None`.
    """

    value: str  # TODO: validate it's a proper url
    url_type: str | None = None
    info: UrlInformation | None = None
    content: ContentInformation | None = None
    semantic: SemanticInformation | None = None

    def __str__(self) -> str:
        # without URL information the raw value is all there is to show
        if self.info is None:
            return self.value
        return f"Url('{self.value}', type={self.url_type})"

    def __repr__(self) -> str:
        # same rendering as `__str__`
        return str(self)
def _now_local() -> datetime:
    """Return the current time as a timezone-aware `datetime` in the local timezone."""
    return datetime.now().astimezone()


class TimestampMixin(Base, kw_only=True):
    """Mixin adding keyword-only `created_at` / `updated_at` timestamps.

    Both default to the time at which the instance is created; each field
    evaluates its default independently.
    """

    # FIXME: when importing bookmarks from other format, we would probably like to keep the added created_at timestamp
    created_at: datetime = field(default_factory=_now_local)
    updated_at: datetime = field(default_factory=_now_local)
# class Twig(TimestampMixin, tag=True):
class Twig(WithID, tag=True):
    """The `Twig` class is the main data item in MagPie. It can be thought of as an augmented
    bookmark. Instead of pointing to a single url, it stands more generally for an item of
    interest, usually a webpage but possibly more, that we want to remember.

    For example, a web page we want to bookmark could be associated with others that are
    semantically related, such as HackerNews/Reddit discussions, GitHub readme page for
    a software project, etc.
    """

    title: str
    # the main URL this twig stands for
    url: Url
    # NOTE(review): an empty `[]` default is presumably handled by msgspec as a
    #               fresh list per instance (not shared) -- confirm against the
    #               msgspec documentation on default values.
    related: list[Url] = []
    rating: int = 0  # 1-5 stars
    tags: list[str] = []
    notes: str | None = None  # user notes/comments
class Folder(WithID, tag=True):
    """A Folder has a name and a list of sub-folders and `Twig` contained inside it.

    It is iterable and indexable, both by item position and object name, ie:

    ```
    folder = Folder(name='root')
    # add some more elements into it...
    folder.add(Twig(title='magpie website', url=Url('https://magpie.digitalgaia.net')))

    first = folder[0]  # this works
    magpie = folder['magpie website']  # this works too
    ```
    """

    name: str
    items: list[Folder | Twig] = []

    def __len__(self) -> int:
        """Return the number of direct children (sub-folders and twigs)."""
        return len(self.items)

    def __getitem__(self, index: int | str) -> Folder | Twig:
        """Return a direct child, looked up by position or by name/title.

        Args:
            index: position in `items`, or the name of a sub-folder / title of
                a twig; the first matching child is returned

        Raises:
            IndexError: if no child has the given name/title, or if the
                position is out of range
        """
        if isinstance(index, str):
            for obj in self.items:
                if match_name(obj, index):
                    return obj
            raise IndexError(f'Could not find Folder/Twig with the name/title: "{index}"')
        return self.items[index]

    def __iter__(self):
        """Iterate over the direct children."""
        return iter(self.items)

    def add(self, item: Folder | Twig):
        """Append `item` at the end of this folder."""
        self.items.append(item)

    def remove(self, index: int | str):
        """Remove a direct child, identified by position or by name/title.

        Args:
            index: position in `items`, or the name of a sub-folder / title of
                a twig; only the first matching child is removed

        Raises:
            IndexError: if `index` is a position that is out of range
            ValueError: if no child has the given name/title
        """
        if isinstance(index, int):
            del self.items[index]
            return

        for i, item in enumerate(self.items):
            if match_name(item, index):
                # safe to delete while enumerating: we stop iterating right after
                del self.items[i]
                return

        raise ValueError(f'Could not delete folder/twig with name: {index}')

    @staticmethod
    def from_urls(urls: list[str | Url]) -> Folder:
        """Build a new `Folder` named 'root' with one `Twig` per given URL.

        Each element can be given either as a `str` or as an already built
        `Url` object; strings are wrapped into a `Url`.
        """
        twigs = []
        for url in urls:
            if isinstance(url, str):
                url = Url(url)
            twigs.append(Twig(title=f'Web page at {url.value}', url=url))

        result = Folder(name='root')
        result.items = twigs
        return result

    # The methods below are empty stubs kept for auto-completion; see the note
    # at the top of the file about utility methods being defined elsewhere.

    def find(self, uuid: uuid.UUID) -> WithID:
        """Return the object reachable from this folder with the given UUID.

        Raises:
            ValueError: if no such object could be found
        """
        ...

    def find_path(self, obj: WithID | None = None, uuid: uuid.UUID | None = None,
                  as_str: bool = False) -> Path:
        """Return a path to the given object, or the object identified by its UUID.

        You need to specify either `obj` or `uuid` but not both.

        Args:
            as_str: if true, return the PathSegments as str (name/title) when
                applicable, or as integers otherwise

        Raises:
            ValueError: if both or none of `obj` and `uuid` is specified
            ValueError: if no such object could be found
        """
        ...

    def iter_twigs(self: Folder, *, depth_first=True): ...

    def iter_urls(self: Folder, *, depth_first=True): ...

    def iter_tree(self: Folder, *, depth_first=True, depth=0): ...

    def iter_with_id(self: Folder, path=None): ...
def match_name(obj: Twig | Folder, name: str) -> bool:
    """Tell whether `obj` is a `Folder` named `name` or a `Twig` titled `name`.

    Objects that are neither a `Folder` nor a `Twig` never match.
    """
    if isinstance(obj, Folder):
        return obj.name == name
    if isinstance(obj, Twig):
        return obj.title == name
    return False
################################################################################ ## ## ## DataFetcher ## ## ## ################################################################################
class DataFetcher:
    """Base interface for the plugins that fetch content for specific URLs.

    A plugin must at least implement `match()` to declare whether it can handle
    a given URL; the other hooks do nothing (and return `None`) by default.
    """

    @classmethod
    def name(cls):
        """Return an identifying name for this DataFetcher.

        It is the last component of the dotted name of the module in which the
        class is defined.
        """
        _, _, last_component = cls.__module__.rpartition('.')
        return last_component

    def match(self, url: Url) -> bool:
        """Tell whether this DataFetcher can extract more information from `url`.

        Raises:
            NotImplementedError: always, in this base class
        """
        raise NotImplementedError()

    def extract_info(self, url: Url) -> UrlInformation:
        """Extract semantic information purely from the URL, without having to download
        or analyze any more resources. This is typically done using regex matching
        and should be cheap to compute.

        You can assume that `self.match(url) is True` if this method is called.

        Examples:
         - a GitHub data fetcher would return (org_name, repo_name),
         - a HackerNews data fetcher would return the discussion ID.
        """
        return None

    def fetch_additional_info(self, url: Url) -> ContentInformation:
        """Extract more information from the given URL, usually by downloading and
        parsing some additional content or related pages.

        Examples:
         - a GitHub data fetcher would download the Readme file of a project
         - a HackerNews fetcher would download comments from a discussion and
           store them as a tree of comments, which are tuples of
           (author, date, comment)
        """
        return None

    def extract_semantic_info(self, url: Url) -> SemanticInformation:
        """Use all the information we have about this URL (extracted using the other
        methods) and feed that to an LLM to extract semantic information.

        This includes: summary, tags, embedding, etc.
        """
        return None

    def expand_data(self, folder: Folder):
        """Try to expand the database by passing the whole folder to each registered
        fetcher and asking them if they can do it.

        This allows the fetchers to not only find more information for a single URL,
        but to manipulate the whole database in order to reorganize it in case they
        need to.

        It is a good idea to call `DataRetriever.identify(folder)` and
        `DataRetriever.fetch(folder)` after this in order to get content for
        additional twigs that might have been created.
        """
        return None