如何在allennlp.common.file_utils中的cached_path()函数中添加自定义下载源

发布时间：2024-01-15 03:42:28

要在allennlp.common.file_utils模块中为cached_path()添加自定义下载源，可以新增一个函数cached_path_custom()，并通过模块级常量_CUSTOM_DOWNLOAD_URL指定下载源的基础URL。以下是一个示例代码：

import hashlib
import os
import shutil
import tempfile
from typing import Optional

import requests

# Default cache directory, used by cached_path_custom() when no cache_dir is given.
_DOWNLOAD_CACHE = os.path.expanduser('~/.allennlp/cache/')
# Base URL of the custom download source. cached_path_custom() concatenates it
# directly with the requested path, so it must end with a trailing slash.
# NOTE(review): this looks like a placeholder host -- replace with the real mirror.
_CUSTOM_DOWNLOAD_URL = "https://my-custom-url.com/"


def cached_path_custom(
    url: str,
    cache_dir: Optional[str] = None,
    filename: Optional[str] = None,
    *,
    base_url: Optional[str] = None,
) -> str:
    """
    Download a file from a custom download source and cache it locally.

    This function is similar to the cached_path() function in the
    allennlp.common.file_utils module, but it fetches the file from a custom
    download source. If a file with the target name already exists in the
    cache directory, its path is returned without any network access.

    # Parameters

    url : str
        The path of the file, relative to the download source. It is appended
        verbatim to the base URL.
    cache_dir : str, optional (default=None)
        The directory to cache the downloaded file. If None, it uses ~/.allennlp/cache/.
    filename : str, optional (default=None)
        The filename to save the downloaded file as. If None, it uses the last
        path segment of `url`.
    base_url : str, optional (default=None)
        The base URL of the download source. If None, it uses the module-level
        `_CUSTOM_DOWNLOAD_URL`.

    # Returns

    str
        The path to the cached file.

    # Raises

    ValueError
        If no filename is given and none can be derived from `url`
        (for example when `url` ends with a slash).
    requests.HTTPError
        If the server answers with an error status code.

    """
    if cache_dir is None:
        cache_dir = _DOWNLOAD_CACHE

    os.makedirs(cache_dir, exist_ok=True)

    if filename is None:
        filename = url.split('/')[-1]
    if not filename:
        # With an empty name the cache path would be the cache directory itself,
        # which always exists and would be returned as if it were the cached file.
        raise ValueError(
            f"Cannot derive a cache filename from url {url!r}; pass `filename` explicitly."
        )

    cache_path = os.path.join(cache_dir, filename)

    if os.path.exists(cache_path):
        return cache_path

    if base_url is None:
        base_url = _CUSTOM_DOWNLOAD_URL

    # Download into a temporary file inside the cache directory so that the final
    # rename stays on one filesystem and never exposes a partially written file.
    temp_fd, temp_path = tempfile.mkstemp(dir=cache_dir, suffix='.tmp')
    try:
        with os.fdopen(temp_fd, 'wb') as temp_file:
            with requests.get(base_url + url, stream=True) as response:
                # Do not cache the body of an error page as if it were the file.
                response.raise_for_status()

                headers = response.headers
                # Progress is reported only when the server advertises range
                # support and the total size is known; the file itself is
                # downloaded regardless of which headers are present.
                show_progress = 'accept-ranges' in headers and 'content-length' in headers
                total_size = int(headers['content-length']) if show_progress else None

                downloaded_size = 0
                for chunk in response.iter_content(chunk_size=8192):
                    if not chunk:
                        continue
                    temp_file.write(chunk)
                    if show_progress:
                        downloaded_size += len(chunk)
                        print(f"\rDownloading {filename}: {downloaded_size}/{total_size} bytes", end=' ')
        # The temporary file is closed (and therefore flushed) before it is published.
        os.replace(temp_path, cache_path)
    finally:
        # After a successful replace the temporary path no longer exists; on any
        # failure this removes the partial download.
        if os.path.exists(temp_path):
            os.remove(temp_path)

    return cache_path

这里我们添加了一个名为cached_path_custom()的新函数。函数首先检查缓存目录中是否已存在同名文件，若存在则直接返回其路径；否则使用requests库以流式方式连接自定义下载源（即_CUSTOM_DOWNLOAD_URL与传入的url拼接而成的地址）。下载内容会逐块写入一个临时文件，下载完成后再将该临时文件移动到缓存路径中。当服务器支持分段请求（响应头包含Accept-Ranges）并且返回了Content-Length时，函数还会在控制台显示已下载字节数与总字节数，作为下载进度。

以下是一个使用该函数的例子：

# NOTE(review): this import assumes cached_path_custom() has been added to
# allennlp.common.file_utils as described above -- confirm it exists in your install.
from allennlp.common.file_utils import cached_path_custom

# Path of the file relative to the custom download source; cached_path_custom()
# appends it to the base URL and, by default, uses its last path segment as the
# cached filename.
url = "example.zip"
# Directory in which the downloaded file is stored (created if it does not exist).
cache_dir = "/path/to/cache_dir"

# Returns the local path of the cached file; the download is skipped when a file
# with that name is already present in cache_dir.
cached_file = cached_path_custom(url, cache_dir=cache_dir)
print(cached_file)

在此例中，我们传入相对路径 "example.zip"（它会被拼接到自定义下载源_CUSTOM_DOWNLOAD_URL之后）和指定的缓存路径 "/path/to/cache_dir" 来下载文件，并由cached_path_custom()函数返回缓存文件的本地路径。