# Source code for magpie.fetchers.generic

import trafilatura
from loguru import logger

from magpie.datamodel import Base, ContentInformationBase, DataFetcher, Folder, Url
from magpie.network import http_get


# Unused info class but this is needed by plugin validation
class Info(Base):
    """Placeholder info model; present only so plugin validation succeeds."""
class Content(ContentInformationBase):
    """Content for typical webpages or blogs that are not specialized

    We reproduce the main fields extracted by
    https://github.com/adbar/trafilatura
    """

    # Fields mirror trafilatura's extraction result; all but the main
    # text may be absent for a given page.
    data: str
    title: str | None
    description: str | None
    date: str | None

    def snapshot(self) -> str:
        """Render the content as a small markdown document
        (title, description, then the main text)."""
        sections = (
            f'# {self.title}',
            f'## {self.description}',
            f'## {self.data}',
        )
        return '\n\n'.join(sections)
class Fetcher(DataFetcher):
    """Catch-all fetcher for generic webpages, using trafilatura to extract
    the main content and basic metadata."""

    def match(self, url: Url) -> bool:
        """Match every URL: this fetcher is the generic fallback used when no
        specialized fetcher claims the URL."""
        return True

    def clean_page(self, content: str) -> Content:
        """
        Generic webpage data extraction using
        [trafilatura](https://github.com/adbar/trafilatura) default
        extraction settings

        Args:
            content: raw HTML of the page.

        Raises:
            ValueError: if trafilatura cannot extract anything from the page.
        """
        extracted = trafilatura.bare_extraction(content, with_metadata=True)
        # bare_extraction returns None when the document cannot be parsed;
        # raise a clear error instead of an AttributeError on `.as_dict()`.
        if extracted is None:
            raise ValueError('trafilatura could not extract content from page')
        result = extracted.as_dict()
        return Content(
            data=result.get('text', None),
            title=result.get('title', None),
            date=result.get('date', None),
            description=result.get('description', None),
        )

    def fetch_additional_info(self, url: Url) -> Content | None:
        """Download `url` and return its cleaned content.

        Best effort: any failure (network error, extraction failure) is
        logged as a warning and None is returned instead of raising.
        """
        try:
            response = http_get(url.value)
            return self.clean_page(response)
        except Exception as e:
            logger.warning(f"Failed to get cleaned content for {url}: {e}")
            return None

    def expand_data(self, folder: Folder):
        # TODO: this could be the place where we look for related URL, i.e URLs that refer to the
        # generic 'real_url' we are processing here
        # Similar to having specialized fetchers, we could have specialized 'searchers' that look if
        # - Real url is linked online (e.g. via HN, GH, Reddit search)
        # - words / images from fetched content are mentionned online (e.g. via HN, GH, Reddit search)
        pass