# Source code for tests.test_hackernews
import pytest
from loguru import logger
from magpie.datamodel import Folder, Url
from magpie.fetch.retriever import DataRetriever
from magpie.fetchers import github, hackernews
# [docs]
def test_hackernews():
    """Verify URL matching and item-id extraction for the HackerNews fetcher."""
    fetcher = hackernews.Fetcher()
    # A non-HackerNews URL must be rejected.
    assert not fetcher.match(Url('https://google.com'))
    hn_url = Url('https://news.ycombinator.com/item?id=44063703')
    assert fetcher.match(hn_url)
    # extract_info must yield an exact hackernews.Info carrying the item id.
    extracted = fetcher.extract_info(hn_url)
    assert type(extracted) is hackernews.Info
    assert extracted.id == '44063703'
# [docs]
def test_hackernews_fetch():
    """
    Test basic functionality of the HackerNews fetcher.
    This doesn't use the Celery task queue.
    """
    fetcher = hackernews.Fetcher()
    item_url = Url('https://news.ycombinator.com/item?id=43955842')
    assert fetcher.match(item_url)
    extracted = fetcher.extract_info(item_url)
    assert type(extracted) is hackernews.Info
    assert extracted.id == '43955842'
    # Attach the parsed info so fetch_additional_info knows which item to load.
    item_url.info = extracted
    fetched = fetcher.fetch_additional_info(item_url)
    # Live fetch: the HN story title and the URL it links to.
    assert fetched.title == 'Scraperr – A Self Hosted Webscraper' # noqa: RUF001
    assert fetched.real_url == 'https://github.com/jaypyles/Scraperr'
# [docs]
@pytest.mark.celery
def test_expand_data():
    """Test a 2-step process where we first fetch the content of the HN link,
    and then expand the DB by moving the referred-to link as main URL for the twig
    and the original HN link as a related link.
    """
    folder = Folder.from_urls(['https://news.ycombinator.com/item?id=43955842'])
    retriever = DataRetriever()
    # Step 1: identify and fetch the HackerNews item itself.
    retriever.identify(folder)
    retriever.fetch(folder)
    retriever.wait_for_tasks_completion(timeout=10)
    assert len(folder) == 1
    twig = folder[0]
    # The fetched content should be HN content pointing at the real target URL.
    assert isinstance(twig.url.content, hackernews.Content)
    assert twig.url.content.real_url == 'https://github.com/jaypyles/Scraperr'
    # Step 2: expand — the referred-to GitHub link becomes the twig's main URL,
    # and is then identified and fetched in turn.
    retriever.expand_data(folder)
    retriever.identify(folder)
    retriever.fetch(folder)
    retriever.wait_for_tasks_completion(timeout=10)
    logger.info(folder.as_text())
    assert isinstance(twig.url.content, github.Content)
    assert twig.url.content.readme.startswith('**A powerful self-hosted web scraping solution**')
    # The original HN link is demoted to a related link on the same twig.
    assert len(twig.related) == 1
    assert twig.related[0].info.id == '43955842'