Source code for magpie.fetchers.hackernews

import re

from loguru import logger

from magpie.datamodel import Base, ContentInformationBase, DataFetcher, Folder, Url
from magpie.network import http_get, http_get_json
from magpie.util import clean_content


class Info(Base):
    """Identifier extracted from a Hacker News item URL."""

    # Item id, kept as the string of digits captured from the URL.
    id: str
class Content(ContentInformationBase):
    """Additional information gathered for a Hacker News item.

    Adds the item's title and the URL the item links to. ``snapshot`` also
    reads ``self.data``, which is not declared in this class (presumably it
    is provided by ``ContentInformationBase`` -- confirm).
    """

    # Title of the item, or None when it could not be fetched.
    title: str | None
    # URL the item links to, or None when it could not be fetched.
    real_url: str | None

    def snapshot(self):
        """Render the title and the data as a short Markdown-style string.

        Returns:
            The title behind a ``# `` marker, an empty line, then the data
            behind a ``## `` marker.
        """
        heading = f'# {self.title}'
        body = f'## {self.data}'
        return f'{heading}\n\n{body}'
class Fetcher(DataFetcher):
    """Fetcher for Hacker News item pages (``news.ycombinator.com/item?id=...``)."""

    # Anchored at the start of the string by ``re.match``; captures the numeric
    # item id in the named group ``id``.
    HN_REGEXP = re.compile(r'https?://news\.ycombinator\.com/item\?id=(?P<id>[0-9]+)')

    def match(self, url: Url) -> bool:
        """Return True when ``url.value`` starts with a Hacker News item URL."""
        found = self.HN_REGEXP.match(url.value)
        return found is not None

    def extract_info(self, url: Url) -> Info:
        """Build an ``Info`` from the item id captured in ``url.value``.

        Returns:
            The ``Info`` built from the regex's named groups, or ``None`` when
            ``url.value`` does not match ``HN_REGEXP`` (despite the ``Info``
            return annotation).
        """
        found = self.HN_REGEXP.match(url.value)
        if found is None:
            return None
        return Info(**found.groupdict())

    def fetch_additional_info(self, url: Url) -> Content:
        """Fetch the cleaned page content and the API metadata for an item.

        Both lookups are best-effort: any exception is logged as a warning and
        the fields that lookup would have filled stay ``None``.

        Returns:
            A ``Content`` carrying whatever could be retrieved.
        """
        # Step 1: the item page itself, cleaned up.
        content = None
        try:
            raw_page = http_get(url.value)
            content = clean_content(raw_page)
        except Exception as e:
            logger.warning(f"Failed to get cleaned content for HN {url}: {e}")

        # Step 2: title and target URL from the Hacker News API. A URL that
        # does not match makes ``extract_info`` return None; the resulting
        # attribute error is handled by the same ``except`` as network errors.
        title = real_url = None
        try:
            hn_id = self.extract_info(url).id
            hn_item = http_get_json(f'https://hacker-news.firebaseio.com/v0/item/{hn_id}.json')
            title = hn_item.get('title')
            real_url = hn_item.get('url')
        except Exception as e:
            logger.warning(f"Failed to get api data for HN {url}: {e}")

        return Content(data=content, title=title, real_url=real_url)

    def expand_data(self, folder: Folder):
        """Point each twig at the URL its Hacker News item links to.

        For every twig whose URL content is a ``Content`` with a non-empty
        ``real_url``, the current URL is appended to ``twig.related`` and
        replaced by a ``Url`` built from ``real_url``.
        """
        for twig in folder.iter_twigs():
            content = twig.url.content
            if not isinstance(content, Content) or not content.real_url:
                continue

            # Keep the Hacker News URL as a related link before replacing it.
            twig.related.append(twig.url)
            twig.url = Url(content.real_url)

            # TODO: do we want to overwrite the previous title here? probably yes
            # NOTE(review): nesting reconstructed from flattened source; the title
            # update is assumed to apply only when the URL was replaced -- confirm.
            if content.title:
                twig.title = content.title