Source code for magpie.fetchers.github

import base64
import re

from loguru import logger

from magpie import config
from magpie.datamodel import Base, ContentInformationBase, DataFetcher, Url
from magpie.network import http_get, http_get_json
from magpie.util import clean_content


class Info(Base):
    """Location of a GitHub resource, split into its URL components.

    Instances are built from the named groups of ``Fetcher.GH_REGEXP``.
    """

    # Account (user or organisation) segment of the URL.
    org: str
    # Repository segment of the URL.
    repo: str
    # Whatever follows the repository segment; empty when nothing does.
    path: str = ''
class Content(ContentInformationBase):
    """Content gathered for a GitHub URL, extended with the repository README."""

    # Cleaned README text, or None when it could not be retrieved.
    readme: str | None

    def snapshot(self):
        """Return the README rendered below a level-two Markdown heading marker.

        NOTE(review): when ``readme`` is ``None`` this yields the literal text
        ``'## None'`` -- confirm that callers expect that.
        """
        return f'## {self.readme}'
class Fetcher(DataFetcher):
    """Fetch the page content and the repository README for ``github.com`` URLs."""

    # Captures the organisation, the repository and whatever follows them.
    GH_REGEXP = re.compile(r'https://github\.com/(?P<org>[\w.-]+)/(?P<repo>[\w.-]+)/?(?P<path>.*)')

    def match(self, url: Url) -> bool:
        """Return True when *url* points at the ``github.com`` host itself.

        A bare prefix test would also accept look-alike hosts such as
        ``https://github.community`` or ``https://github.com.example.org``,
        so the character right after the host has to end the host name.
        """
        prefix = 'https://github.com'
        value = url.value
        if not value.startswith(prefix):
            return False
        # '' covers the bare host; '/', '?' and '#' open the path, the query
        # and the fragment respectively.
        return value[len(prefix):len(prefix) + 1] in ('', '/', '?', '#')

    def extract_info(self, url: Url) -> Info | None:
        """Parse organisation, repository and trailing path out of *url*.

        Returns:
            An ``Info`` built from the named groups of ``GH_REGEXP``, or
            ``None`` when the URL carries no ``<org>/<repo>`` part.
        """
        if found := self.GH_REGEXP.match(url.value):
            return Info(**found.groupdict())
        # Not a repository URL: say so explicitly instead of falling off the
        # end of the function.
        return None

    def fetch_additional_info(self, url: Url) -> Content:
        """Collect the cleaned page content and the README for *url*.

        The two lookups are independent and best effort: a failure in either
        one is logged as a warning and leaves the matching field as ``None``.

        Returns:
            A ``Content`` whose ``data`` is the cleaned page and whose
            ``readme`` is the cleaned README (each possibly ``None``).
        """
        content, readme = None, None

        try:
            content = clean_content(http_get(url.value))
        except Exception as e:
            logger.warning(f"Failed to get cleaned content for {url}: {e}")

        try:
            readme = clean_content(self.get_readme(url))
        except Exception as e:
            logger.warning(f"Failed to fetch README from GitHub API for {url}: {e}")

        return Content(data=content, readme=readme)

    def get_readme(self, url: Url) -> str:
        """Download and decode the README of the repository behind *url*.

        Reads ``url.info.org`` and ``url.info.repo`` to build the API request,
        and sends ``config.GITHUB_TOKEN`` as a bearer token when it is set.

        Returns:
            The README text, base64-decoded and interpreted as UTF-8.

        Raises:
            AttributeError: if ``url.info`` is ``None``.
            UnicodeDecodeError: if the decoded bytes are not valid UTF-8.
            Whatever ``http_get_json`` raises on a failed request, plus a
            lookup error when the response has no ``'content'`` entry.
        """
        headers = {
            "Accept": "application/vnd.github+json",
            "X-GitHub-Api-Version": "2022-11-28",
        }
        token = config.GITHUB_TOKEN
        if token:
            # The token is deliberately never logged, so the secret cannot leak.
            headers['Authorization'] = f'Bearer {token}'

        # see: https://docs.github.com/en/rest/repos/contents?apiVersion=2022-11-28#get-a-repository-readme
        readme_url = f"https://api.github.com/repos/{url.info.org}/{url.info.repo}/readme"
        readme_data = http_get_json(readme_url, headers=headers)

        # The file body sits base64-encoded under the 'content' key.
        readme_content = readme_data['content']
        return base64.b64decode(readme_content).decode('utf-8')