Source code for magpie.fetchers.generic
import trafilatura
from loguru import logger
from magpie.datamodel import Base, ContentInformationBase, DataFetcher, Folder, Url
from magpie.network import http_get
# Unused info class, but it is needed by plugin validation
class Content(ContentInformationBase):
    """Content for typical webpages or blogs that are not specialized.

    We reproduce the main fields extracted by https://github.com/adbar/trafilatura
    """
    data: str
    title: str | None
    description: str | None
    date: str | None
    def snapshot(self):
        return f'# {self.title}\n\n## {self.description}\n\n## {self.data}'
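# Illustrative only (not part of the module): a Content built from made-up
# values renders as a small markdown document via snapshot():
#
#   c = Content(data='Main text.', title='A title', description='A summary', date='2024-01-01')
#   c.snapshot()
#   # '# A title\n\n## A summary\n\n## Main text.'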
class Fetcher(DataFetcher):
    def match(self, url: Url) -> bool:
        # Catch-all: the generic fetcher matches every URL.
        return True
    def clean_page(self, content: str) -> Content:
        """Generic webpage data extraction using [trafilatura](https://github.com/adbar/trafilatura)
        with its default extraction settings.
        """
        # If extraction fails here, the exception propagates to
        # fetch_additional_info, which logs a warning and returns None.
        result = trafilatura.bare_extraction(content, with_metadata=True).as_dict()
        return Content(data=result.get('text', None),
                       title=result.get('title', None),
                       date=result.get('date', None),
                       description=result.get('description', None))
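    # Illustrative sketch (not part of the module): clean_page on a minimal,
    # made-up HTML string. The exact field values depend on trafilatura's
    # extraction heuristics, so treat the expected outputs as assumptions.
    #
    #   html = ("<html><head><title>Hello</title></head>"
    #           "<body><p>Some body text long enough to extract.</p></body></html>")
    #   content = Fetcher().clean_page(html)
    #   content.title  # likely 'Hello'
    #   content.data   # the extracted main text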
    def fetch_additional_info(self, url: Url) -> Content | None:
        try:
            response = http_get(url.value)
            return self.clean_page(response)
        except Exception as e:
            logger.warning(f"Failed to get cleaned content for {url}: {e}")
            return None
    def expand_data(self, folder: Folder):
        # TODO: this could be the place where we look for related URLs, i.e. URLs that refer
        # to the generic 'real_url' we are processing here.
        # Similar to having specialized fetchers, we could have specialized 'searchers' that
        # check whether:
        # - the real URL is linked online (e.g. via HN, GH, Reddit search)
        # - words / images from the fetched content are mentioned online (e.g. via HN, GH,
        #   Reddit search)
        # A sketch of the HN case follows below.
        pass
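        # A minimal sketch of the HN case (illustrative, not implemented here):
        # it assumes the public Algolia Hacker News search API
        # (https://hn.algolia.com/api/v1/search) and that http_get returns the
        # response body as a string; find_hn_mentions is a hypothetical helper.
        #
        #   import json
        #   from urllib.parse import quote
        #
        #   def find_hn_mentions(url: Url) -> list[str]:
        #       """Return URLs of HN submissions that link to or mention `url`."""
        #       raw = http_get(f"https://hn.algolia.com/api/v1/search?query={quote(url.value)}")
        #       hits = json.loads(raw).get("hits", [])
        #       return [hit["url"] for hit in hits if hit.get("url")]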