Source code for magpie.fetchers.github
import base64
import re
from loguru import logger
from magpie import config
from magpie.datamodel import Base, ContentInformationBase, DataFetcher, Url
from magpie.network import http_get, http_get_json
from magpie.util import clean_content
[docs]
class Info(Base):
    """Identifying parts of a GitHub URL: owner, repository and optional sub-path."""

    # Owner segment of the URL (first path component after the host).
    # NOTE(review): presumably filled from the same-named groups of
    # Fetcher.GH_REGEXP -- confirm where Info instances are built.
    org: str
    # Repository segment of the URL.
    repo: str
    # Anything following the repository segment; defaults to the empty string.
    path: str = ""
[docs]
class Content(ContentInformationBase):
    """Content collected for a GitHub URL, extended with the repository README."""

    # README text, or None when it could not be retrieved.
    readme: str | None

    def snapshot(self):
        """Return the README prefixed with a Markdown ``## `` marker.

        The value is interpolated as-is, so a missing README (``None``)
        yields the literal string ``'## None'``.
        """
        readme_text = self.readme
        return f'## {readme_text}'
[docs]
class Fetcher(DataFetcher):
    """Fetcher for ``https://github.com`` URLs.

    In addition to the cleaned page content, it retrieves the repository
    README through the GitHub REST API.
    """

    # Splits a repository URL into ``org``, ``repo`` and a (possibly empty) ``path``.
    GH_REGEXP = re.compile(r'https://github\.com/(?P<org>[\w.-]+)/(?P<repo>[\w.-]+)/?(?P<path>.*)')

    def match(self, url: Url) -> bool:
        """Tell whether ``url`` points at the ``github.com`` host.

        A bare prefix test would also accept look-alike hosts such as
        ``https://github.community`` or ``https://github.com.evil.example``,
        so the character following the host must be the end of the string
        or a URL delimiter (``/``, ``?`` or ``#``).

        Args:
            url: the URL to test; only ``url.value`` is inspected.

        Returns:
            True when the URL is on ``https://github.com``, False otherwise.
        """
        prefix = 'https://github.com'
        value = url.value
        if not value.startswith(prefix):
            return False
        # Slicing (rather than indexing) yields '' when the URL is exactly the prefix.
        next_char = value[len(prefix):len(prefix) + 1]
        return next_char in ('', '/', '?', '#')

    def fetch_additional_info(self, url: Url) -> Content:
        """Collect the cleaned page content and the cleaned README for ``url``.

        Both lookups are best-effort and independent of each other: a failure
        is logged as a warning and the corresponding field is left as ``None``.

        Args:
            url: the GitHub URL to fetch.

        Returns:
            A ``Content`` whose ``data`` and ``readme`` are each either the
            cleaned text or ``None``.
        """
        content, readme = None, None

        try:
            content = clean_content(http_get(url.value))
        except Exception as e:
            logger.warning(f"Failed to get cleaned content for {url}: {e}")

        try:
            readme = clean_content(self.get_readme(url))
        except Exception as e:
            logger.warning(f"Failed to call fetch README from GitHub API: {e}")

        return Content(data=content, readme=readme)

    def get_readme(self, url: Url) -> str:
        """Download and decode the repository README via the GitHub REST API.

        The request is authenticated with ``config.GITHUB_TOKEN`` when that
        value is truthy, and sent anonymously otherwise.

        Args:
            url: a URL whose ``info`` exposes the ``org`` and ``repo`` to query.

        Returns:
            The README text, base64-decoded and interpreted as UTF-8.

        Raises:
            Nothing is caught here: errors from the HTTP request, a response
            without a ``content`` entry, or bytes that are not valid UTF-8
            all propagate to the caller.
        """
        headers = {
            "Accept": "application/vnd.github+json",
            "X-GitHub-Api-Version": "2022-11-28",
        }

        token = config.GITHUB_TOKEN
        if token:
            # Deliberately not logged, not even partially: the token is a secret.
            headers['Authorization'] = f'Bearer {token}'

        # see: https://docs.github.com/en/rest/repos/contents?apiVersion=2022-11-28#get-a-repository-readme
        readme_url = f"https://api.github.com/repos/{url.info.org}/{url.info.repo}/readme"
        readme_data = http_get_json(readme_url, headers=headers)

        # The README body is read from the 'content' entry and base64-decoded.
        readme_content = readme_data['content']
        return base64.b64decode(readme_content).decode('utf-8')