Source code for magpie.fetch.cache

import os
import re
import shutil
from html import unescape
from os.path import exists, join
from pathlib import Path
from urllib.parse import unquote, urlparse

import duct
from celery import chain, group, shared_task
from loguru import logger

from magpie.config import CACHE_DIR
from magpie.datamodel import Folder, Twig

# Options spliced into the `wget` command line built by `cache_url_duct`,
# in the order they appear on that command line.
#
# Options that are currently left out:
#   --content-disposition            same purpose as `--trust-server-names`,
#                                    which seems to be the better option
#   --timestamping (-N)              do not re-download files that haven't
#                                    been modified
#   --backup-converted (-K)          back up the original file when
#                                    converting a file
#   --restrict-file-names=nocontrol
WGET_OPTS = [
    # -H: do not limit download of assets to the page host
    '--span-hosts',
    # -E: adjust extension for CGI-generated material
    '--adjust-extension',
    # for dynamic files, use the name given by the server
    '--trust-server-names',
    # -k: convert links on the page to point to the downloaded copies
    '--convert-links',
    # -p: download all resources necessary to properly display the page
    '--page-requisites',
    # ignore robots.txt
    '--execute', 'robots=off',
]


@shared_task
def cache_url_duct(url: str):
    """Download `url` into the cache directory by running `wget`.

    The command is ``wget`` followed by ``WGET_OPTS`` and the URL, executed
    with ``CACHE_DIR`` as working directory. stderr is merged into stdout and
    the combined output is logged at trace level. The command is built with
    ``.unchecked()``, so a non-zero `wget` exit status does not raise.

    Args:
        url: Address of the page to cache.
    """
    logger.info(f'Cache URL: {url}')

    # `--` ends option parsing: a URL that starts with a dash is then handled
    # as a URL instead of being interpreted as a wget option.
    cmd = ['wget', *WGET_OPTS, '--', url]

    # Join outside the f-string: reusing the enclosing quote character inside
    # a replacement field is a SyntaxError before Python 3.12.
    command_line = ' '.join(cmd)
    logger.trace(f'Running command: {command_line}')

    output = duct.cmd(*cmd).dir(CACHE_DIR).stderr_to_stdout().unchecked().read()
    logger.trace(output)
def cache_urls_in_folder(folder: Folder, cache_url):
    """Walk `folder` recursively and hand every `Twig` URL to `cache_url`.

    Args:
        folder: Container to iterate; nested `Folder` items are descended into.
        cache_url: Callable invoked with the `url` attribute of each `Twig`.

    Raises:
        TypeError: If an item is neither a `Folder` nor a `Twig`.
    """
    for entry in folder:
        if isinstance(entry, Folder):
            # sub-folder: recurse with the same callback
            cache_urls_in_folder(entry, cache_url)
            continue

        if not isinstance(entry, Twig):
            raise TypeError(f'Invalid type for caching item: {entry} (type={type(entry)})')

        cache_url(entry.url)
        logger.warning(f'cache_url {entry.url} returns')
def url_to_cached_url(url: str) -> str:
    """Map a remote URL to the ``file://`` URL of its copy under ``CACHE_DIR``.

    The local path is ``CACHE_DIR/<netloc>/<path>`` (plus params/query, if
    any). Candidates are probed in this order and the first existing one wins:

    1. ``<local path>/index.html``
    2. ``<local path>.html``
    3. ``<local path>`` itself (returned even when it does not exist)

    A URL fragment (``#section``) is not part of any file name, so it is left
    out while probing the file system and appended again to the result.

    Args:
        url: The original remote URL.

    Returns:
        A ``file://`` URL pointing into the cache directory.
    """
    parsed = urlparse(url)
    logger.debug(parsed)

    # Keep the fragment aside: with it glued to the path, the `exists()` checks
    # below could never match `index.html` / `.html` for URLs such as
    # `https://host/guide#intro`.
    fragment_suffix = f'#{parsed.fragment}' if parsed.fragment else ''

    # With an empty scheme and a non-empty netloc, `geturl()` yields
    # `//<netloc><path>...`; the slice drops that leading `//`.
    local = parsed._replace(
        scheme='',
        netloc=f'{CACHE_DIR}/{parsed.netloc}',
        fragment='',
    )
    cached_file = local.geturl()[2:]
    logger.debug(cached_file)

    # NOTE(review): a query string stays in `cached_file` verbatim; whether
    # that matches the on-disk name and yields a usable file URL is not
    # handled here — confirm against real cached pages.
    index_file = Path(cached_file) / 'index.html'
    if exists(index_file):
        return 'file://' + str(index_file) + fragment_suffix
    if exists(cached_file + '.html'):
        return 'file://' + cached_file + '.html' + fragment_suffix
    return 'file://' + cached_file + fragment_suffix
def clear_cache():
    """Remove everything under ``CACHE_DIR`` and leave an empty directory.

    A cache directory that does not exist yet is not treated as an error: it
    is simply created. Any other failure while deleting (for example a
    permission problem) still propagates.
    """
    try:
        shutil.rmtree(CACHE_DIR)
    except FileNotFoundError:
        # Nothing to delete; the directory is created just below.
        pass
    os.makedirs(CACHE_DIR, exist_ok=True)
def refresh_cache_celery(collection: Folder):
    """Queue Celery tasks that download every URL found in `collection`.

    The collection is first walked with `cache_urls_in_folder` to gather the
    URLs. One `cache_url_duct` signature is then created per URL, the
    signatures are put in a `group`, the group is chained with a
    `fix_extensionless_links` signature, and the resulting workflow is called.

    Args:
        collection: Root folder whose URLs should be cached.
    """
    logger.debug(collection.as_text())

    pending = []

    def collect(url):
        # only the `.value` of the object handed over by the walk is queued
        pending.append(url.value)

    # This pass downloads nothing: it only traverses the collection and fills
    # `pending` with the URLs to cache.
    cache_urls_in_folder(collection, collect)

    # TODO: notify progress in status bar
    #       use the `run_task()` function in `magpie.task.queue` to do so
    downloads = group(cache_url_duct.s(url) for url in pending)
    # NOTE(review): `fix_extensionless_links` is neither defined nor imported
    # in the visible part of this module — confirm where it comes from.
    workflow = chain(downloads, fix_extensionless_links.s())
    workflow()
def test_cache_url():
    """Cache one fixed page through the Celery task queue, then fix its links.

    Submits `cache_url` from `magpie.tasks` with `.delay()`, polls the result
    once per second (logging a progress line each time) until it is ready, and
    finally calls `fix_extensionless_links`.
    """
    import time

    from magpie.tasks import cache_url

    logger.info('░▒▓ TEST CACHE URL USING CELERY TASK QUEUE ▓▒░')

    pending = cache_url.delay('https://docs.astral.sh/uv/')
    while True:
        if pending.ready():
            break
        logger.info('...')
        time.sleep(1)

    logger.info("Fix dynamic links that `--content-disposition` didn't")
    fix_extensionless_links()
# test: check that cached page displays the bar graph at the top