Source code for tests.test_github

import pytest
from loguru import logger

from magpie.datamodel import Folder, Url
from magpie.fetch import DataRetriever
from magpie.fetchers import github

# https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/chatllama (org/repo has readme)
# https://github.com/jonathandata1/Pegasus-CatalanGate-False-Positives (404)
# https://github.com/Lissy93/personal-security-checklist/blob/mas2ter/5_Privacy_Respecting_Software.md


def test_github():
    """Check that the GitHub fetcher matches GitHub URLs and parses org/repo/path."""
    fetcher = github.Fetcher()

    # A URL outside github.com must not be claimed by this fetcher.
    assert not fetcher.match(Url('https://google.com'))

    repo_url = Url('https://github.com/digigaia/magpie')
    assert fetcher.match(repo_url)

    parsed = fetcher.extract_info(repo_url)
    # Exact type check (not isinstance): the result must be a github.Info itself.
    assert type(parsed) is github.Info

    # A bare org/repo URL has no extra path component.
    expected_fields = (
        ('org', 'digigaia'),
        ('repo', 'magpie'),
        ('path', ''),
    )
    for field_name, expected_value in expected_fields:
        assert getattr(parsed, field_name) == expected_value
def test_github_fetch():
    """
    Exercise the GitHub fetcher directly on a URL that points inside a repository.

    The fetcher is called in-process here; the Celery task queue is not used.
    """
    fetcher = github.Fetcher()
    target = Url('https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/chatllama')
    assert fetcher.match(target)

    parsed = fetcher.extract_info(target)
    # Exact type check (not isinstance): the result must be a github.Info itself.
    assert type(parsed) is github.Info

    # Everything after org/repo is expected to end up in `path`.
    expected_fields = (
        ('org', 'nebuly-ai'),
        ('repo', 'nebullvm'),
        ('path', 'tree/main/apps/accelerate/chatllama'),
    )
    for field_name, expected_value in expected_fields:
        assert getattr(parsed, field_name) == expected_value

    # The fetcher reads the parsed info from the Url object itself.
    target.info = parsed
    fetched = fetcher.fetch_additional_info(target)

    # No file data is expected: the URL points to a non-existent file (404 not found).
    assert fetched.data is None
    # The README must still have been retrieved and mention 'optimate'.
    assert fetched.readme is not None
    assert 'optimate' in fetched.readme
@pytest.mark.celery
def test_github_fetch_readme():
    """
    Fetch READMEs for a set of plain org/repo GitHub URLs and check their content.

    A Folder is built from the URLs, run through DataRetriever (identify, fetch,
    wait for the tasks to complete), and the start of each retrieved README is
    compared to the expected text for that known page.
    """
    # Leading content of each page was verified on 2025-05-08.
    # Each case is (org, repo, expected start of README); the repositories
    # differ in how their README file name is capitalised.
    cases = [
        ('ggerganov', 'whisper.cpp', '# whisper.cpp'),  # has README.md
        ('antonio-morales', 'Fuzzing101', '# Fuzzing-101'),  # has Readme.md
        ('DanielMartensson', 'EmbeddedLapack', '# EmbeddedLapack'),  # has ReadMe.md
        ('you-dont-need', 'You-Dont-Need-GUI', '# You Don\'t Need GUI'),  # has readme.md
    ]

    # Build a simple Folder holding nothing but the basic GitHub links.
    repo_urls = [f'https://github.com/{org}/{repo}' for org, repo, _ in cases]
    folder = Folder.from_urls(repo_urls)
    readme_heads = [head for _, _, head in cases]

    retriever = DataRetriever()
    retriever.identify(folder)
    retriever.fetch(folder)
    retriever.wait_for_tasks_completion(timeout=10)

    logger.info(folder.as_plain_text())

    # Walk the folder in step with the expected README beginnings.
    for entry, head in zip(folder, readme_heads):
        assert entry.url.url_type == 'github'
        assert entry.url.content.readme.startswith(head)