# Source code for tests.test_generic

import pytest

from magpie.datamodel import Folder, Url
from magpie.fetch.retriever import DataRetriever
from magpie.fetchers import generic, github, hackernews


def test_generic():
    """Check that the generic fetcher matches a plain website URL."""
    fetcher = generic.Fetcher()
    target = Url('http://digitalgaia.net')

    assert fetcher.match(target)
def test_generic_fetch():
    """
    Exercise the generic fetcher directly on two URLs.

    For each URL the fetcher must match it, and the fetched info must carry
    the expected title and the expected beginning of the extracted data.
    This doesn't use the Celery task queue.
    """
    fetcher = generic.Fetcher()

    gaia_url = Url('https://digitalgaia.net')
    assert fetcher.match(gaia_url)
    gaia_info = fetcher.fetch_additional_info(gaia_url)
    assert gaia_info.title == 'Digital Gaia'
    assert gaia_info.data.startswith(
        'Digital Gaia “A place where nature and technology weave into one another…”'
    )

    # FIXME - redirects do not work
    # https://billaboop.com -> https://billaboop.com/en
    billaboop_url = Url('https://billaboop.com/en')
    assert fetcher.match(billaboop_url)
    billaboop_info = fetcher.fetch_additional_info(billaboop_url)
    assert billaboop_info.title == 'Architects of the wow effect'
    assert billaboop_info.data.startswith(
        'Billaboop is a software company with more than 7 years of experience.'
    )
@pytest.mark.celery
def test_fetch_data():
    """Fetch a few blog posts and pages through the magpie pipeline.

    Every entry is expected to end up with ``generic.Content`` carrying the
    title listed for its position.
    """
    expected_titles = [
        'Digital Gaia',
        'Architects of the wow effect',
        'Cory Doctorow on how we lost the internet',
    ]
    folder = Folder.from_urls([
        'https://digitalgaia.net',
        'https://billaboop.com/en',
        'https://lwn.net/Articles/1021871/',
    ])

    retriever = DataRetriever()
    retriever.identify(folder)
    retriever.fetch(folder)
    retriever.wait_for_tasks_completion(timeout=10)

    assert len(folder) == 3

    # Entries are checked by index, in the same order as the URLs above.
    for position, title in enumerate(expected_titles):
        twig = folder[position]
        assert isinstance(twig.url.content, generic.Content)
        assert twig.url.content.title == title
@pytest.mark.celery
def test_fetch_mixed_content():
    """Fetch a folder mixing a generic page, a HackerNews item and a GitHub URL.

    Each entry must receive the content type of its own fetcher module
    (``generic``, ``hackernews``, ``github``) along with the expected details.
    """
    urls = [
        'https://digitalgaia.net',
        'https://news.ycombinator.com/item?id=43955842',
        'https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/chatllama',
    ]
    folder = Folder.from_urls(urls)

    retriever = DataRetriever()
    retriever.identify(folder)
    retriever.fetch(folder)
    retriever.wait_for_tasks_completion(timeout=10)

    assert len(folder) == 3

    # Plain website -> generic content.
    generic_twig = folder[0]
    assert isinstance(generic_twig.url.content, generic.Content)
    assert generic_twig.url.content.title == 'Digital Gaia'

    # HackerNews item -> hackernews content exposing the linked URL.
    hn_twig = folder[1]
    assert isinstance(hn_twig.url.content, hackernews.Content)
    assert hn_twig.url.content.real_url == 'https://github.com/jaypyles/Scraperr'

    # GitHub tree URL -> github content plus parsed org/repo/path info.
    github_twig = folder[2]
    assert isinstance(github_twig.url.content, github.Content)
    assert github_twig.url.info.org == 'nebuly-ai'
    assert github_twig.url.info.repo == 'nebullvm'
    assert github_twig.url.info.path == 'tree/main/apps/accelerate/chatllama'
    assert 'optimate' in github_twig.url.content.readme