# Source code for tests.test_hackernews

import pytest
from loguru import logger

from magpie.datamodel import Folder, Url
from magpie.fetch.retriever import DataRetriever
from magpie.fetchers import github, hackernews


def test_hackernews():
    """Test HackerNews URL matching."""
    hn = hackernews.Fetcher()
    assert not hn.match(Url('https://google.com'))
    url = Url('https://news.ycombinator.com/item?id=44063703')
    assert hn.match(url)
    info = hn.extract_info(url)
    assert type(info) is hackernews.Info
    assert info.id == '44063703'
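

# For reference, a minimal standalone sketch of the matching logic the test
# above exercises, assuming the fetcher keys off the ``id`` query parameter of
# news.ycombinator.com item links. This is an illustration only, not the
# actual hackernews.Fetcher implementation.
def _hn_item_id_sketch(link: str) -> str | None:
    # Import locally so the sketch stays self-contained and does not touch
    # the module-level imports above.
    from urllib.parse import parse_qs, urlparse

    parsed = urlparse(link)
    if parsed.netloc != 'news.ycombinator.com' or parsed.path != '/item':
        return None
    ids = parse_qs(parsed.query).get('id')
    return ids[0] if ids else None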


def test_hackernews_fetch():
    """
    Test basic functionality of the HackerNews fetcher.

    This doesn't use the Celery task queue.
    """
    hn = hackernews.Fetcher()
    url = Url('https://news.ycombinator.com/item?id=43955842')
    assert hn.match(url)
    info = hn.extract_info(url)
    assert type(info) is hackernews.Info
    assert info.id == '43955842'
    url.info = info
    fetch = hn.fetch_additional_info(url)
    assert fetch.title == 'Scraperr – A Self Hosted Webscraper'  # noqa: RUF001
    assert fetch.real_url == 'https://github.com/jaypyles/Scraperr'
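

# fetch_additional_info() above resolves the submission title and the linked
# ("real") URL. One plausible data source is the official HackerNews Firebase
# API; this standalone sketch shows that route. It is an assumption about
# Magpie's internals, not its actual fetch path, and it needs network access.
def _fetch_hn_item_json_sketch(item_id: str) -> dict:
    import json
    from urllib.request import urlopen

    endpoint = f'https://hacker-news.firebaseio.com/v0/item/{item_id}.json'
    # The returned JSON includes e.g. 'title' and 'url' fields, which would
    # correspond to fetch.title and fetch.real_url in the test above.
    with urlopen(endpoint) as resp:
        return json.load(resp)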


@pytest.mark.celery
def test_expand_data():
    """Test a 2-step process: first fetch the content of the HN link, then
    expand the DB by promoting the referred-to link to be the twig's main
    URL, keeping the original HN link as a related link.
    """
    data = Folder.from_urls(['https://news.ycombinator.com/item?id=43955842'])
    magpie = DataRetriever()
    magpie.identify(data)
    magpie.fetch(data)
    magpie.wait_for_tasks_completion(timeout=10)
    assert len(data) == 1
    twig = data[0]
    assert isinstance(twig.url.content, hackernews.Content)
    assert twig.url.content.real_url == 'https://github.com/jaypyles/Scraperr'
    magpie.expand_data(data)
    magpie.identify(data)
    magpie.fetch(data)
    magpie.wait_for_tasks_completion(timeout=10)
    logger.info(data.as_text())
    assert isinstance(twig.url.content, github.Content)
    assert twig.url.content.readme.startswith('**A powerful self-hosted web scraping solution**')
    assert len(twig.related) == 1
    assert twig.related[0].info.id == '43955842'
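

# The test above is marked with ``pytest.mark.celery`` and needs a running
# Celery worker/broker. Assuming the marker is registered for test selection,
# pytest's ``-m`` option can include or exclude it, e.g.:
#
#     pytest -m celery tests/test_hackernews.py        # only Celery tests
#     pytest -m "not celery" tests/test_hackernews.py  # skip Celery tests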