from __future__ import annotations
import uuid
from datetime import datetime
import msgspec
from msgspec import Struct, StructMeta, field
# NOTE: most of the utility methods are defined outside of this file to keep the
# data model definitions as simple as possible.
# However, to have auto-completion working we still need to keep empty stubs
# for these methods.
# One step of a traversal path: a `str`, an `int` or an `(int, str)` tuple.
# NOTE(review): the documentation string below does not describe the tuple
# form -- confirm how `tuple[int, str]` segments are meant to be interpreted.
PathSegment = str | int | tuple[int, str]
"""Objects deriving from our `Base` struct also implement path traversal using one of
the following ways to go through an object:
- fragment is a `str` and starts with a leading dot '.' => we return the attribute of the object
which name is the fragment with the leading dot removed
- fragment is an `int` => we return `obj[frag]`
- fragment is a `str` => we return `obj[frag]`
"""
# A whole traversal path: a list of segments (presumably applied in order,
# starting from the object being traversed).
Path = list[PathSegment]
"""A `Path` consists of a list of `PathSegment`."""
class Base(
    msgspec.Struct,
    metaclass=TaggedStructMeta,
    omit_defaults=True,
    forbid_unknown_fields=True,
):
    """Common root of every struct in the data model.

    Two msgspec options are switched on for all subclasses:

    - ``omit_defaults=True``: fields that still hold their default value are
      left out of the encoded output.
    - ``forbid_unknown_fields=True``: decoding fails with a clear error when
      the serialized data contains a field the struct does not declare, which
      helps catch typos early.
    """

    # Stub kept for auto-completion only: per the note at the top of this
    # module, the utility methods are defined outside of this file.
    def _format(self, fmt: FormatSpec, level: int = 0) -> str: ...

    # Rendering presets handed to `_format()` by the `as_*()` helpers below.
    # NOTE(review): confirm that `space` in HTML_FORMAT is the intended
    # character -- plain spaces collapse when rendered as HTML.
    TEXT_FORMAT = FormatSpec(space=' ', newline='\n', start_bold='', end_bold='')
    ANSI_FORMAT = FormatSpec(space=' ', newline='\n', start_bold='\033[1m', end_bold='\033[22m')
    HTML_FORMAT = FormatSpec(space=' ', newline='<br>', start_bold='<b>', end_bold='</b>')

    def as_plain_text(self) -> str:
        """Render the object as unstyled text, e.g. for writing to a file."""
        plain_spec = Base.TEXT_FORMAT
        return self._format(plain_spec)

    def as_text(self) -> str:
        """Render the object as text with ANSI bold codes, for a terminal."""
        ansi_spec = Base.ANSI_FORMAT
        return self._format(ansi_spec)

    def as_html(self) -> str:
        """Render the object as HTML."""
        html_spec = Base.HTML_FORMAT
        return self._format(html_spec)

    def to_dict(self):
        """Convert the object to builtin types using `msgspec.to_builtins`."""
        as_builtins = msgspec.to_builtins(self)
        return as_builtins

    def follow(self, path: Path | PathSegment) -> Base:
        """Walk `path` (a full path or a single segment) starting at `self`.

        Returns the object that the path leads to. This is a stub; the
        implementation is provided outside of this file.
        """
        ...
# having kw_only=True allows us to inherit this class and still have positional args
# see: https://jcristharif.com/msgspec/structs.html#field-ordering
class WithID(Base, kw_only=True):
    """Base class for models that carry a `uuid` field.

    When no value is supplied, a new one is generated with `uuid.uuid4`.
    """

    uuid: uuid.UUID = field(default_factory=uuid.uuid4)
################################################################################
## ##
## Types of information ##
## ##
################################################################################
# Placeholder binding: the name starts out as `None` here. According to the
# documentation string below, it is rebound when the fetcher plugins are
# loaded (that code is not part of this section).
UrlInformation = None
"""This class represents the information that can be extracted from the
URL alone.
It is initially defined as `None` but will be created dynamically when loading
the fetcher plugins to be the union of all of the fetchers `Info` classes. It
allows us to fully import the `datamodel` module and make it available to plugins.
This is fine as annotations are evaluated lazily and will only be required to
be correct when validating structs, and not when importing modules.
"""
class ContentInformationBase(Base, tag='contentbase'):
    """Additional information obtained by downloading and/or parsing a URL.

    The only field declared here is `data`: the content the URL points at.
    That is HTML code most of the time, but it could be binary (e.g. PDFs,
    images, etc.)
    """

    # FIXME: should be `str | bytes | None`, but msgspec does not support
    # `str | bytes`, so `bytes` is left out for now.
    data: str | None
# Placeholder alias: bound to `ContentInformationBase` at import time.
# According to the documentation string below, it is rebound when the fetcher
# plugins are loaded (that code is not part of this section).
ContentInformation = ContentInformationBase
"""This class represents the additional information that can be fetched
for this URL, by downloading and/or parsing its contents.
It is initially defined as `ContentInformationBase` but will be created dynamically
when loading the fetcher plugins to be the union of all of the fetchers `Content` classes.
It allows us to fully import the `datamodel` module and make it available to plugins.
This is fine as annotations are evaluated lazily and will only be required to
be correct when validating structs, and not when importing modules.
"""
class SemanticModel(Base):
    """A model identifier together with its prompt and backend settings."""

    # Name of the model; an external enum is supposed to define the valid
    # values. FIXME: use an `enum` instead of `str`.
    model: str

    # Template string for LLM models, `None` for embedding models.
    prompt: str | None

    # Must be compatible with the supported backend (currently
    # `llama_cpp.Llama` for LLMs and
    # `sentence_transformers.SentenceTransformer` for embeddings).
    settings: dict | None
class SemanticSummary(SemanticBaseInformation):
    """Semantic information whose `content` is a string.

    Presumably the generated summary text -- the base class is defined
    outside of this section.
    """

    content: str
class SemanticEmbedding(SemanticBaseInformation):
    """Semantic information whose `content` is a list of floats.

    Presumably the embedding vector -- the base class is defined outside of
    this section.
    """

    # A dedicated vector type (some vector[float], numpy.array, torch.Tensor)
    # would be better than a plain list.
    content: list[float]
################################################################################
## ##
## URL and Twig models ##
## ##
################################################################################
class Url(WithID):
    """A URL, optionally enriched with extra information.

    Only `value` is required; `url_type`, `info`, `content` and `semantic`
    all default to `None`.
    """

    value: str  # TODO: validate it's a proper url
    url_type: str | None = None
    info: UrlInformation | None = None
    content: ContentInformation | None = None
    semantic: SemanticInformation | None = None

    def __str__(self) -> str:
        """Return the bare URL, or a `Url(...)` form once `info` is set."""
        if self.info is None:
            return self.value
        # NOTE(review): this branch is selected on `info` but displays
        # `url_type` -- confirm that the two are always set together.
        return f"Url('{self.value}', type={self.url_type})"

    def __repr__(self) -> str:
        """Use the same text as `__str__`."""
        return str(self)
class TimestampMixin(Base, kw_only=True):
    """Adds keyword-only `created_at` and `updated_at` timestamp fields.

    Each field defaults to the current time, made timezone-aware in the
    local timezone via `datetime.now().astimezone()`. The two defaults are
    computed by separate calls.
    """

    # FIXME: when importing bookmarks from another format, we would probably
    # like to keep the original creation timestamp.
    created_at: datetime = field(
        default_factory=lambda: datetime.now().astimezone(),
    )
    updated_at: datetime = field(
        default_factory=lambda: datetime.now().astimezone(),
    )
# class Twig(TimestampMixin, tag=True):
class Twig(WithID, tag=True):
    """The main data item in MagPie: an augmented bookmark.

    Rather than pointing at one single url, a `Twig` stands more generally
    for an item of interest that we want to remember -- usually a web page,
    but possibly more. For example, a bookmarked web page could be associated
    with others that are semantically related, such as HackerNews/Reddit
    discussions, the GitHub readme page of a software project, etc.
    """

    title: str
    url: Url

    # Empty-list defaults: msgspec accepts empty mutable collections as
    # defaults and copies them for each instance (see the msgspec docs).
    related: list[Url] = []
    rating: int = 0  # 1-5 stars
    tags: list[str] = []

    # Free-form user notes/comments.
    notes: str | None = None
class Folder(WithID, tag=True):
    """A named container holding sub-folders and `Twig` objects.

    It is iterable and indexable, both by item position and by object name
    (a folder's `name` or a twig's `title`), i.e.:

    ```
    folder = Folder(name='bookmarks')  # add some more elements into it...
    folder.add(Twig(title='magpie website', url=Url('https://magpie.digitalgaia.net')))
    first = folder[0]  # this works
    magpie = folder['magpie website']  # this works too
    ```
    """

    name: str
    # Empty-list default: msgspec accepts empty mutable collections as
    # defaults and copies them for each instance (see the msgspec docs).
    items: list[Folder | Twig] = []

    def __len__(self) -> int:
        """Return the number of direct children (no recursion)."""
        return len(self.items)

    def __getitem__(self, index: int | str) -> Folder | Twig:
        """Return a direct child by position or by name/title.

        Raises:
            IndexError: if `index` is a `str` that matches no child, or an
                integer outside the valid range
        """
        if isinstance(index, str):
            for obj in self.items:
                if match_name(obj, index):
                    return obj
            # IndexError (not KeyError) is kept for backward compatibility.
            raise IndexError(f'Could not find Folder/Twig with the name/title: "{index}"')
        return self.items[index]

    def __iter__(self):
        """Iterate over the direct children."""
        return iter(self.items)

    def add(self, item: Folder | Twig):
        """Append `item` at the end of this folder."""
        self.items.append(item)

    def remove(self, index: int | str):
        """Delete a direct child by position or by name/title.

        When several children share the given name, only the first is deleted.

        Raises:
            IndexError: if `index` is an integer outside the valid range
            ValueError: if `index` is a name that matches no child
        """
        if isinstance(index, int):
            del self.items[index]
            return
        for i, item in enumerate(self.items):
            if match_name(item, index):
                del self.items[i]
                return
        raise ValueError(f'Could not delete folder/twig with name: {index}')

    @staticmethod
    def from_urls(urls: list[str | Url]) -> Folder:
        """Build a new `Folder` named 'root' with one `Twig` per given URL.

        Each element may be a plain `str` (wrapped into a new `Url`) or an
        existing `Url` object (used as is).
        """
        twigs = []
        for url in urls:
            if isinstance(url, str):
                url = Url(url)
            twigs.append(Twig(title=f'Web page at {url.value}', url=url))
        return Folder(name='root', items=twigs)

    # The methods below are stubs kept for auto-completion only: per the note
    # at the top of this module, they are defined outside of this file.

    def find(self, uuid: uuid.UUID) -> WithID:
        """Return the object reachable from this folder with the given UUID.

        Raises:
            ValueError: if no such object could be found
        """
        ...

    def find_path(self,
                  obj: WithID | None = None,
                  uuid: uuid.UUID | None = None,
                  as_str: bool = False) -> Path:
        """Return a path to the given object, or the object identified by its UUID.

        You need to specify either `obj` or `uuid` but not both.

        Args:
            as_str: if true, return the PathSegments as str (name/title) when
                applicable, or as integers otherwise

        Raises:
            ValueError: if both or none of `obj` and `uuid` is specified
            ValueError: if no such object could be found
        """
        ...

    def iter_twigs(self: Folder, *, depth_first=True): ...

    def iter_urls(self: Folder, *, depth_first=True): ...

    def iter_tree(self: Folder, *, depth_first=True, depth=0): ...

    def iter_with_id(self: Folder, path=None): ...
def match_name(obj: Twig | Folder, name: str) -> bool:
    """Tell whether `obj` is labelled `name`.

    A `Folder` is compared on its `name`, a `Twig` on its `title`. Any other
    kind of object never matches.
    """
    if isinstance(obj, Folder):
        return obj.name == name
    if isinstance(obj, Twig):
        return obj.title == name
    return False
################################################################################
## ##
## DataFetcher ##
## ##
################################################################################
class DataFetcher:
    """Base interface for all plugins that want to fetch specific content.

    Subclasses need to implement at least `match()` to declare whether they
    can handle a certain URL.
    """

    @classmethod
    def name(cls):
        """Return an identifying name for this DataFetcher.

        The name is the last dotted component of the name of the module in
        which the class is defined.
        """
        module_path = cls.__module__
        return module_path.rpartition('.')[-1]

    def match(self, url: Url) -> bool:
        """Return whether this DataFetcher is able to extract more
        information from the given URL.

        The base implementation always raises `NotImplementedError`.
        """
        raise NotImplementedError

    def fetch_additional_info(self, url: Url) -> ContentInformation:
        """Extract more information from the given URL, usually by
        downloading and parsing some additional content or related pages.

        The base implementation does nothing and returns `None`.

        Examples:
            - a GitHub data fetcher would download the Readme file
              of a project
            - a HackerNews fetcher would download comments from a
              discussion and store them as a tree of comments, which
              are tuples of (author, date, comment)
        """
        return None

    def expand_data(self, folder: Folder):
        """Try to expand the database by passing the whole folder to
        each registered fetcher and asking them if they can do it.

        This allows the fetchers to not only find more information for
        a single URL, but to manipulate the whole database in order to
        reorganize it in case they need to.

        It is a good idea to call `DataRetriever.identify(folder)` and
        `DataRetriever.fetch(folder)` after this in order to get content for
        additional twigs that might have been created.

        The base implementation does nothing and returns `None`.
        """
        return None