Source code for magpie.fetchers.hackernews
import re
from loguru import logger
from magpie.datamodel import Base, ContentInformationBase, DataFetcher, Folder, Url
from magpie.network import http_get, http_get_json
from magpie.util import clean_content
[docs]
class Info(Base):
    """Identifying information for a single Hacker News item."""

    # Item identifier, kept as a string.
    # NOTE(review): presumably filled from the ``id`` group of the fetcher's
    # URL pattern by the base fetcher machinery -- confirm against DataFetcher.
    id: str
[docs]
class Content(ContentInformationBase):
    """Content record attached to a Hacker News item URL.

    Adds the item's title and the URL the item points to. ``snapshot`` also
    reads ``self.data``, which is not declared here (presumably it comes from
    ``ContentInformationBase`` -- confirm).
    """

    # Title of the HN item; None when it could not be retrieved.
    title: str | None
    # URL the HN item links to; None when the item has none or the lookup failed.
    real_url: str | None

    def snapshot(self):
        """Render the title and the data as a two-heading Markdown string.

        Values are interpolated as-is, so a missing title or data shows up
        as the literal text ``None``.
        """
        rendered = f'# {self.title}\n\n## {self.data}'
        return rendered
[docs]
class Fetcher(DataFetcher):
    """Fetcher for Hacker News item pages (``news.ycombinator.com/item?id=...``)."""

    # Matches an HN item URL (http or https) and captures the digits of the
    # item id in the named group ``id``.
    HN_REGEXP = re.compile(r'https?://news\.ycombinator\.com/item\?id=(?P<id>[0-9]+)')

    def match(self, url: Url) -> bool:
        """Return True when ``url.value`` begins with a Hacker News item URL."""
        found = self.HN_REGEXP.match(url.value)
        return found is not None

    def fetch_additional_info(self, url: Url) -> Content:
        """Collect the cleaned page text and the API metadata for an HN item.

        Two independent, best-effort steps are performed:

        1. download ``url.value`` and pass it through ``clean_content``;
        2. query the HN Firebase API for the item and read its ``title`` and
           ``url`` fields.

        A failure in either step is logged as a warning and leaves the
        corresponding fields at ``None``; it never propagates to the caller.

        Returns:
            A ``Content`` built from whatever could be retrieved.
        """
        page_text = None
        try:
            page_text = clean_content(http_get(url.value))
        except Exception as e:
            logger.warning(f"Failed to get cleaned content for HN {url}: {e}")

        item_title = None
        target_url = None
        try:
            # ``extract_info`` is not defined in this class (presumably
            # inherited from DataFetcher); only its ``.id`` is used here.
            hn_id = self.extract_info(url).id
            hn_item = http_get_json(f'https://hacker-news.firebaseio.com/v0/item/{hn_id}.json')
            item_title = hn_item.get('title')
            target_url = hn_item.get('url')
        except Exception as e:
            logger.warning(f"Failed to get api data for HN {url}: {e}")

        return Content(data=page_text, title=item_title, real_url=target_url)

    def expand_data(self, folder: Folder):
        """Point each HN twig at the URL the HN item links to.

        For every twig whose URL content is a ``Content`` with a non-empty
        ``real_url``: the current (HN) URL is appended to ``twig.related``,
        ``twig.url`` is replaced by a ``Url`` built from ``real_url``, and
        the twig title is overwritten when the content has a title.
        Other twigs are left untouched.
        """
        for twig in folder.iter_twigs():
            hn_content = twig.url.content
            # Skip twigs without HN content or without a link target.
            if not isinstance(hn_content, Content) or not hn_content.real_url:
                continue

            # Keep the HN discussion URL reachable before replacing it.
            twig.related.append(twig.url)
            twig.url = Url(hn_content.real_url)

            # TODO: do we want to overwrite the previous title here? probably yes
            # NOTE(review): this check is assumed to sit under the real_url
            # guard above -- confirm the nesting against the upstream file.
            if hn_content.title:
                twig.title = hn_content.title