# Source code for harvesttext.download_utils

import os
import shutil
import requests
import hashlib
from tqdm import tqdm
from collections import namedtuple
from os import environ, listdir, makedirs
from os.path import dirname, exists, expanduser, isdir, join, splitext

# Lightweight record describing one downloadable resource: the local file
# name to save it under, the URL to fetch it from, and the expected SHA256
# hex digest used to verify the download.
RemoteFileMetadata = namedtuple('RemoteFileMetadata', 'filename url checksum')

# Proxy mapping handed to ``requests`` when a download is asked to use a
# proxy. Adjust it to your own machine; the value below is meant to match
# the default local endpoint of Shadowsocks.
DEFAULT_PROXIES = {
    scheme: 'socks5h://127.0.0.1:1080' for scheme in ('http', 'https')
}

def get_data_home(data_home=None):
    """Return the path of the HarvestText data dir, creating it if needed.

    This folder is used as a download cache so that large resources do not
    have to be downloaded several times.

    By default the data dir is the '.harvesttext' folder in the user home
    folder. Alternatively, it can be set by the 'HARVESTTEXT_DATA'
    environment variable or programmatically by giving an explicit folder
    path. The '~' symbol is expanded to the user home folder.

    If the folder does not already exist, it is automatically created.

    Parameters
    ----------
    data_home : str | None
        The path to data dir. When None, the 'HARVESTTEXT_DATA' environment
        variable is used, falling back to '~/.harvesttext'.

    Returns
    -------
    data_home : str
        The expanded path of the (now existing) data dir.
    """
    if data_home is None:
        data_home = environ.get('HARVESTTEXT_DATA', join('~', '.harvesttext'))
    data_home = expanduser(data_home)
    # exist_ok=True replaces the former "check, then create" sequence, which
    # could fail if the directory appeared between the two calls.
    makedirs(data_home, exist_ok=True)
    return data_home
def clear_data_home(data_home=None):
    """Delete the data home cache together with everything inside it.

    The location is resolved through ``get_data_home`` (which creates the
    folder when it is missing) and the whole directory tree, including the
    folder itself, is then removed.

    Parameters
    ----------
    data_home : str | None
        The path to data dir. None selects the default location.
    """
    shutil.rmtree(get_data_home(data_home))
def _sha256(path):
    """Calculate the sha256 hash of the file at path.

    The file is read in fixed-size chunks so that it never has to be held
    in memory as a whole.

    Parameters
    ----------
    path : string
        Path of the file to hash.

    Returns
    -------
    digest : string
        Hexadecimal SHA256 digest of the file content.
    """
    sha256hash = hashlib.sha256()
    chunk_size = 8192
    with open(path, "rb") as f:
        # iter() with a sentinel stops at the first empty read (end of file).
        for buffer in iter(lambda: f.read(chunk_size), b""):
            sha256hash.update(buffer)
    return sha256hash.hexdigest()


def _download_with_bar(url, file_path, proxies=DEFAULT_PROXIES):
    """Stream ``url`` into ``file_path`` while showing a tqdm progress bar.

    The data is first written to ``file_path + '.part'`` and only moved to
    ``file_path`` once the transfer completed, so an interrupted or failed
    download never leaves a truncated file under the final name.

    Parameters
    ----------
    url : string
        Address of the resource to download.

    file_path : string
        Destination path of the downloaded file.

    proxies : dict | None
        Proxy mapping forwarded to ``requests.get``.

    Returns
    -------
    file_path : string
        The destination path, unchanged.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    Exception
        If the number of received bytes differs from the announced
        content length.
    """
    partial_path = file_path + '.part'
    block_size = 1024  # 1 KB
    received = 0
    try:
        # Streaming, so we can iterate over the response. The context manager
        # releases the connection even when the body is not fully consumed.
        with requests.get(url, stream=True, proxies=proxies) as response:
            # Fail early instead of saving an HTTP error page as the file.
            response.raise_for_status()
            total_size_in_bytes = int(response.headers.get('content-length', 0))
            with tqdm(total=total_size_in_bytes, unit='B',
                      unit_scale=True) as progress_bar:
                with open(partial_path, 'wb') as file:
                    for data in response.iter_content(block_size):
                        received += len(data)
                        progress_bar.update(len(data))
                        file.write(data)
        if total_size_in_bytes != 0 and received != total_size_in_bytes:
            raise Exception("ERROR, something went wrong with the downloading")
        os.replace(partial_path, file_path)
    finally:
        # After a successful os.replace the partial file is gone; on any
        # failure this removes the incomplete data.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    return file_path


def _fetch_remote(remote, dirname=None, use_proxy=False, proxies=DEFAULT_PROXIES):
    """Helper function to download a remote dataset into path

    Fetch a dataset pointed by remote's url, save into path using remote's
    filename and ensure its integrity based on the SHA256 Checksum of the
    downloaded file. A file that fails the check is deleted again so that a
    later "does the file exist" test cannot mistake it for a valid download.

    Parameters
    ----------
    remote : RemoteFileMetadata
        Named tuple containing remote dataset meta information: url, filename
        and checksum

    dirname : string
        Directory to save the file to. None saves relative to the current
        working directory.

    use_proxy : bool
        When False, ``proxies`` is ignored and no proxy mapping is passed on.

    proxies : dict
        Proxy mapping used when ``use_proxy`` is True.

    Returns
    -------
    file_path: string
        Full path of the created file.

    Raises
    ------
    IOError
        If the SHA256 checksum of the downloaded file differs from
        ``remote.checksum``.
    """
    file_path = (remote.filename if dirname is None
                 else join(dirname, remote.filename))
    proxies = proxies if use_proxy else None
    file_path = _download_with_bar(remote.url, file_path, proxies)
    checksum = _sha256(file_path)
    if remote.checksum != checksum:
        # Do not keep a file that is known to be wrong on disk.
        os.remove(file_path)
        raise IOError("{} has an SHA256 checksum ({}) "
                      "differing from expected ({}), "
                      "file may be corrupted.".format(file_path, checksum,
                                                      remote.checksum))
    return file_path
def download(remote, file_path=None, use_proxy=False, proxies=DEFAULT_PROXIES):
    """Download ``remote`` into the data home and return the saved path.

    Parameters
    ----------
    remote : RemoteFileMetadata
        Named tuple with the filename, url and checksum of the resource.

    file_path : string | None
        Accepted for backward compatibility but not used: the file is always
        stored in the directory returned by ``get_data_home()`` under
        ``remote.filename``.

    use_proxy : bool
        Whether the download should go through ``proxies``.

    proxies : dict
        Proxy mapping forwarded to the download helper.

    Returns
    -------
    file_path : string
        Full path of the downloaded file.
    """
    return _fetch_remote(remote, get_data_home(), use_proxy, proxies)
def check_download_resource(remote, use_proxy=False, proxies=None):
    """Return the local path of ``remote``, downloading it only if missing.

    The resource is looked up in the data home under ``remote.filename``.
    An existing file is returned as-is; its checksum is not verified again.

    Parameters
    ----------
    remote : RemoteFileMetadata
        Named tuple with the filename, url and checksum of the resource.

    use_proxy : bool
        Whether a download, if one is needed, should go through a proxy.

    proxies : dict | None
        Proxy mapping to use. When ``use_proxy`` is True and this is None,
        ``DEFAULT_PROXIES`` is used.

    Returns
    -------
    file_path : string
        Full path of the local file.
    """
    proxies = DEFAULT_PROXIES if use_proxy and proxies is None else proxies
    data_home = get_data_home()
    file_path = os.path.join(data_home, remote.filename)
    if not os.path.exists(file_path):
        # currently don't capture error at this level, assume download success
        # Forward the proxy settings by keyword; previously they were dropped
        # and the data home was passed into download()'s unused file_path slot.
        file_path = download(remote, use_proxy=use_proxy, proxies=proxies)
    return file_path
if __name__ == "__main__":
    # Manual smoke test: fetch one release wheel into the data home and
    # print where it was saved.
    wheel_name = 'harvesttext-0.7.2-py3-none-any.whl'
    ARCHIVE = RemoteFileMetadata(
        filename=wheel_name,
        url=('https://github.com/blmoistawinde/HarvestText/releases/download/'
             'V0.7.2/' + wheel_name),
        checksum=('004c8b0b1858f69025f721bc84cff331'
                  '27d53c6ab526beed7a7a801a9c21f30b'))

    print("Download")
    file_path = download(ARCHIVE)
    print(file_path)

    # If a proxy is available, the same download can be routed through it
    # with download(ARCHIVE, use_proxy=True).