# Source code for tests.test_github
import pytest
from loguru import logger
from magpie.datamodel import Folder, Url
from magpie.fetch import DataRetriever
from magpie.fetchers import github
# https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/chatllama (org/repo has readme)
# https://github.com/jonathandata1/Pegasus-CatalanGate-False-Positives (404)
# https://github.com/Lissy93/personal-security-checklist/blob/master/5_Privacy_Respecting_Software.md (404, but repo is still valid and has readme)
# [docs]
def test_github():
    """Test GitHub URL matching and org/repo/path extraction.

    Only ``Fetcher.match`` and ``Fetcher.extract_info`` are exercised: a
    non-GitHub URL must be rejected, and a plain ``org/repo`` URL must be
    matched and split into its org, repo and (empty) path components.
    """
    gh = github.Fetcher()

    # a URL on another host must not be claimed by the GitHub fetcher
    assert not gh.match(Url('https://google.com'))

    url = Url('https://github.com/digigaia/magpie')
    assert gh.match(url)

    info = gh.extract_info(url)
    # isinstance (rather than an exact type() comparison) keeps this check
    # consistent with the other GitHub tests in this module
    assert isinstance(info, github.Info)
    assert info.org == 'digigaia'
    assert info.repo == 'magpie'
    # a bare org/repo URL has nothing after the repo name
    assert info.path == ''
# [docs]
def test_github_fetch():
    """
    Test basic functionality of the GitHub fetcher.

    The fetcher is called directly on a single URL; this doesn't use the Celery task queue.
    The URL points to a path inside a repo that does not exist (404), so no data is
    expected for the path itself, while the README should still be returned.
    """
    gh = github.Fetcher()
    url = Url('https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/chatllama')
    assert gh.match(url)

    info = gh.extract_info(url)
    # isinstance (rather than an exact type() comparison) keeps this check
    # consistent with the other GitHub tests in this module
    assert isinstance(info, github.Info)
    assert info.org == 'nebuly-ai'
    assert info.repo == 'nebullvm'
    assert info.path == 'tree/main/apps/accelerate/chatllama'

    # the extracted info is attached to the URL before fetching
    url.info = info
    fetch = gh.fetch_additional_info(url)
    assert fetch.data is None  # URL points to non-existent file, 404 not found
    assert fetch.readme is not None
    assert 'optimate' in fetch.readme
# [docs]
@pytest.mark.celery
def test_github_fetch_readme():
    """
    Create twigs that contain org/repo urls and use the GitHub fetcher to access README content
    using the GitHub API.

    The repos are chosen so that each one spells its README filename with a different
    capitalisation. The results are compared to expected text for known pages.
    """
    # verified leading content of pages on 2025-05-08
    gh_test = [  # (org, repo, expected_result)
        ('ggerganov', 'whisper.cpp', '# whisper.cpp'),  # has README.md
        ('antonio-morales', 'Fuzzing101', '# Fuzzing-101'),  # has Readme.md
        ('DanielMartensson', 'EmbeddedLapack', '# EmbeddedLapack'),  # has ReadMe.md
        ('you-dont-need', 'You-Dont-Need-GUI', '# You Don\'t Need GUI'),  # has readme.md
    ]

    # create simple Folder with basic GH links
    data = Folder.from_urls([f'https://github.com/{org}/{repo}' for org, repo, _ in gh_test])
    expected_results = [expected for _, _, expected in gh_test]

    magpie = DataRetriever()
    magpie.identify(data)
    magpie.fetch(data)
    magpie.wait_for_tasks_completion(timeout=10)

    logger.info(data.as_plain_text())

    # zip() stops at the shortest input, so without this check a folder that
    # yields fewer twigs than expected would pass without being verified
    twigs = list(data)
    assert len(twigs) == len(expected_results)

    # iterate on data
    for twig, expected in zip(twigs, expected_results):
        assert twig.url.url_type == 'github'
        assert twig.url.content.readme.startswith(expected)
# [docs]
@pytest.mark.celery
def test_github_tricky_links():
    """
    Exercise the fetch pipeline on GitHub links that don't resolve cleanly.

    Covers a path inside a renamed/discontinued repo, a repo that has been
    deleted, and a missing file inside a repo that is still valid.
    """
    # (url, org, repo, expected README prefix -- None when no README is expected)
    cases = [
        # org/repo has readme, but repo changed name and is discontinued
        (
            'https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/chatllama',
            'nebuly-ai',
            'nebullvm',
            '# OptiMate **[Legacy]** This repository is now in a legacy phase',
        ),
        # 404, repo has been deleted
        (
            'https://github.com/jonathandata1/Pegasus-CatalanGate-False-Positives',
            'jonathandata1',
            'Pegasus-CatalanGate-False-Positives',
            None,
        ),
        # 404, but repo is still valid and has a readme
        (
            'https://github.com/Lissy93/personal-security-checklist/blob/master/5_Privacy_Respecting_Software.md',
            'Lissy93',
            'personal-security-checklist',
            'Personal Security Checklist',
        ),
    ]

    data = Folder.from_urls([link for link, _, _, _ in cases])

    retriever = DataRetriever()
    retriever.identify(data)
    retriever.fetch(data)
    retriever.wait_for_tasks_completion(timeout=10)

    logger.info(data.as_text())

    # results are looked up by position, in the same order as the input links
    for position, (_, org, repo, readme_prefix) in enumerate(cases):
        target = data[position].url
        assert isinstance(target.info, github.Info)
        assert target.info.org == org
        assert target.info.repo == repo
        # content.data is expected to be None for every one of these links
        assert target.content.data is None
        readme = target.content.readme
        if readme_prefix is None:
            assert readme is None
        else:
            assert readme.startswith(readme_prefix)