# Source code for tests.test_hackernews
import pytest
from loguru import logger
from magpie.datamodel import Folder, Url
from magpie.fetch.retriever import DataRetriever
from magpie.fetchers import github, hackernews
# [docs]
def test_hackernews():
    """Verify URL matching and item-id extraction for the HackerNews fetcher."""
    fetcher = hackernews.Fetcher()
    # A non-HackerNews URL must be rejected.
    assert not fetcher.match(Url('https://google.com'))
    hn_url = Url('https://news.ycombinator.com/item?id=44063703')
    assert fetcher.match(hn_url)
    # extract_info must yield an exact hackernews.Info carrying the item id.
    extracted = fetcher.extract_info(hn_url)
    assert type(extracted) is hackernews.Info
    assert extracted.id == '44063703'
# [docs]
def test_hackernews_fetch():
    """
    Test basic functionality of the HackerNews fetcher.
    This doesn't use the Celery task queue.
    """
    fetcher = hackernews.Fetcher()
    item_url = Url('https://news.ycombinator.com/item?id=43955842')
    assert fetcher.match(item_url)
    extracted = fetcher.extract_info(item_url)
    assert type(extracted) is hackernews.Info
    assert extracted.id == '43955842'
    # Attach the parsed info so fetch_additional_info knows which item to load.
    item_url.info = extracted
    fetched = fetcher.fetch_additional_info(item_url)
    # Live fetch: the HN story title and the URL it links to.
    assert fetched.title == 'Scraperr – A Self Hosted Webscraper' # noqa: RUF001
    assert fetched.real_url == 'https://github.com/jaypyles/Scraperr'
# [docs]
@pytest.mark.celery
def test_expand_data():
    """Test a 2-step process where we first fetch the content of the HN link,
    and then expand the DB by moving the referred-to link as main URL for the twig
    and the original HN link as a related link.
    """
    folder = Folder.from_urls(['https://news.ycombinator.com/item?id=43955842'])
    retriever = DataRetriever()
    # Step 1: identify and fetch the HackerNews item itself.
    retriever.identify(folder)
    retriever.fetch(folder)
    retriever.wait_for_tasks_completion(timeout=10)
    assert len(folder) == 1
    twig = folder[0]
    # The fetched content should be HN content pointing at the real target URL.
    assert isinstance(twig.url.content, hackernews.Content)
    assert twig.url.content.real_url == 'https://github.com/jaypyles/Scraperr'
    # Step 2: expand — the referred-to GitHub link becomes the twig's main URL,
    # and is then identified and fetched in turn.
    retriever.expand_data(folder)
    retriever.identify(folder)
    retriever.fetch(folder)
    retriever.wait_for_tasks_completion(timeout=10)
    logger.info(folder.as_text())
    assert isinstance(twig.url.content, github.Content)
    assert twig.url.content.readme.startswith('**A powerful self-hosted web scraping solution**')
    # The original HN link is demoted to a related link on the same twig.
    assert len(twig.related) == 1
    assert twig.related[0].info.id == '43955842'