Source code for magpie.fetchers.generic
import trafilatura
from loguru import logger
from magpie.datamodel import Base, ContentInformationBase, DataFetcher, Folder, Url
from magpie.network import http_get
# Unused info class, but it is needed by plugin validation
class Content(ContentInformationBase):
    """Content for typical webpages or blogs that are not specialized.

    We reproduce the main fields extracted by https://github.com/adbar/trafilatura
    """
    data: str
    title: str | None
    description: str | None
    date: str | None
    def snapshot(self):
        return f'# {self.title}\n\n## {self.description}\n\n## {self.data}'
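# Illustrative only (not part of the module): a Content built from made-up
# values renders as a small markdown document via snapshot():
#
#   c = Content(data='Main text.', title='A title', description='A summary', date='2024-01-01')
#   c.snapshot()
#   # '# A title\n\n## A summary\n\n## Main text.'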
class Fetcher(DataFetcher):
    def match(self, url: Url) -> bool:
        # Catch-all: the generic fetcher matches every URL.
        return True
    def clean_page(self, content: str) -> Content:
        """Generic webpage data extraction using [trafilatura](https://github.com/adbar/trafilatura)
        with its default extraction settings.
        """
        # If extraction fails here, the exception propagates to
        # fetch_additional_info, which logs a warning and returns None.
        result = trafilatura.bare_extraction(content, with_metadata=True).as_dict()
        return Content(data=result.get('text', None),
                       title=result.get('title', None),
                       date=result.get('date', None),
                       description=result.get('description', None))
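    # Illustrative sketch (not part of the module): clean_page on a minimal,
    # made-up HTML string. The exact field values depend on trafilatura's
    # extraction heuristics, so treat the expected outputs as assumptions.
    #
    #   html = ("<html><head><title>Hello</title></head>"
    #           "<body><p>Some body text long enough to extract.</p></body></html>")
    #   content = Fetcher().clean_page(html)
    #   content.title  # likely 'Hello'
    #   content.data   # the extracted main text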
    def fetch_additional_info(self, url: Url) -> Content | None:
        try:
            response = http_get(url.value)
            return self.clean_page(response)
        except Exception as e:
            logger.warning(f"Failed to get cleaned content for {url}: {e}")
            return None
    def expand_data(self, folder: Folder):
        # TODO: this could be the place where we look for related URLs, i.e. URLs that refer
        # to the generic 'real_url' we are processing here.
        # Similar to having specialized fetchers, we could have specialized 'searchers' that
        # check whether:
        # - the real URL is linked online (e.g. via HN, GH, Reddit search)
        # - words / images from the fetched content are mentioned online (e.g. via HN, GH,
        #   Reddit search)
        # A sketch of the HN case follows below.
        pass
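        # A minimal sketch of the HN case (illustrative, not implemented here):
        # it assumes the public Algolia Hacker News search API
        # (https://hn.algolia.com/api/v1/search) and that http_get returns the
        # response body as a string; find_hn_mentions is a hypothetical helper.
        #
        #   import json
        #   from urllib.parse import quote
        #
        #   def find_hn_mentions(url: Url) -> list[str]:
        #       """Return URLs of HN submissions that link to or mention `url`."""
        #       raw = http_get(f"https://hn.algolia.com/api/v1/search?query={quote(url.value)}")
        #       hits = json.loads(raw).get("hits", [])
        #       return [hit["url"] for hit in hits if hit.get("url")]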