# Source code for tests.test_generic
import pytest
from magpie.datamodel import Folder, Url
from magpie.fetch.retriever import DataRetriever
from magpie.fetchers import generic, github, hackernews
[docs]
def test_generic():
    """Check that the generic fetcher matches a plain website URL."""
    fetcher = generic.Fetcher()
    assert fetcher.match(Url('http://digitalgaia.net'))
[docs]
def test_generic_fetch():
    """
    Exercise the generic fetcher directly on two pages.

    For each page the fetcher must match the URL, and the info returned by
    ``fetch_additional_info`` must have the expected title and its data must
    start with the expected text. This doesn't use the Celery task queue.
    """
    # FIXME - redirects do not work
    #   https://billaboop.com -> https://billaboop.com/en
    # (presumably why the second case uses the '/en' address directly)
    expectations = (
        (
            'https://digitalgaia.net',
            'Digital Gaia',
            'Digital Gaia “A place where nature and technology weave into one another…”',
        ),
        (
            'https://billaboop.com/en',
            'Architects of the wow effect',
            'Billaboop is a software company with more than 7 years of experience.',
        ),
    )

    fetcher = generic.Fetcher()
    for address, expected_title, expected_opening in expectations:
        target = Url(address)
        assert fetcher.match(target)
        info = fetcher.fetch_additional_info(target)
        assert info.title == expected_title
        assert info.data.startswith(expected_opening)
[docs]
@pytest.mark.celery
def test_fetch_data():
    """Fetch content of a few blog posts and pages in the magpie pipeline.

    Builds a folder from three URLs, runs identify + fetch through a
    ``DataRetriever``, waits for the tasks to complete, then checks that every
    entry ended up with ``generic.Content`` carrying the expected title.
    """
    # (url, expected title) in the order the folder is built and checked
    pages = [
        ('https://digitalgaia.net', 'Digital Gaia'),
        ('https://billaboop.com/en', 'Architects of the wow effect'),
        ('https://lwn.net/Articles/1021871/', 'Cory Doctorow on how we lost the internet'),
    ]

    data = Folder.from_urls([address for address, _ in pages])

    retriever = DataRetriever()
    retriever.identify(data)
    retriever.fetch(data)
    retriever.wait_for_tasks_completion(timeout=10)

    assert len(data) == 3

    for index, (_, expected_title) in enumerate(pages):
        content = data[index].url.content
        assert isinstance(content, generic.Content)
        assert content.title == expected_title
[docs]
@pytest.mark.celery
def test_fetch_mixed_content():
    """Fetch content for URLs expected to resolve to different fetchers.

    The folder mixes a plain website, a Hacker News item and a GitHub tree
    URL; after identify + fetch each entry must hold the content type of the
    generic, hackernews and github fetcher modules respectively.
    """
    data = Folder.from_urls([
        'https://digitalgaia.net',
        'https://news.ycombinator.com/item?id=43955842',
        'https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/chatllama',
    ])

    retriever = DataRetriever()
    retriever.identify(data)
    retriever.fetch(data)
    retriever.wait_for_tasks_completion(timeout=10)

    assert len(data) == 3

    # Plain website -> generic content
    website = data[0].url.content
    assert isinstance(website, generic.Content)
    assert website.title == 'Digital Gaia'

    # Hacker News item -> hackernews content exposing the linked URL
    story = data[1].url.content
    assert isinstance(story, hackernews.Content)
    assert story.real_url == 'https://github.com/jaypyles/Scraperr'

    # GitHub tree URL -> github content plus parsed org/repo/path info
    repo_url = data[2].url
    assert isinstance(repo_url.content, github.Content)
    repo_info = repo_url.info
    assert repo_info.org == 'nebuly-ai'
    assert repo_info.repo == 'nebullvm'
    assert repo_info.path == 'tree/main/apps/accelerate/chatllama'
    assert 'optimate' in repo_url.content.readme