# Source code for EduData.DataSet.download_data.download_data

# coding: utf-8
# create by tongshiwei on 2019/7/2

# Public names exported by this module.
# NOTE(review): URL_DICT is exported and used below, but its definition is not
# present in this view of the file -- confirm it is defined at module level.
__all__ = ["URL_DICT", "get_data", "list_resources"]

import os
import re
from urllib.request import urlretrieve

import requests
from bs4 import BeautifulSoup
from longling import config_logging, LogLevel, path_append, flush_print

from .utils import decompress, reporthook4urlretrieve, timestamp2time, format_byte_sizeof, get_uz_path

# Default value of the ``data_dir`` argument of ``get_data``.
# NOTE(review): presumably resolves to the current working directory -- confirm
# against longling's ``path_append``.
DEFAULT_DATADIR = path_append("./", "", to_str=True)

# Module-wide logger, registered under the name "downloader" with console
# level INFO.
logger = config_logging(logger="downloader", console_log_level=LogLevel.INFO)

# Base URL: requested as the index page and prepended to every href found
# there by ``get_dataset_name``.
# NOTE(review): the value is an empty string here -- TODO confirm the intended
# base URL.
prefix = ''


def get_dataset_name():  # pragma: no cover
    """Register datasets listed on the index page at ``prefix`` in ``URL_DICT``.

    The page at ``prefix`` is fetched and every anchor on it is inspected.
    For each href that is non-empty and does not start with ``'.'``, the URL
    ``prefix + href`` is stored in ``URL_DICT`` under the href with its last
    character removed, unless that URL is already a value of ``URL_DICT`` or
    is explicitly excluded.

    Returns
    -------
    None
        ``URL_DICT`` is updated in place.
    """
    # URLs that are already registered; a set gives O(1) membership tests.
    known_urls = set(URL_DICT.values())

    # URLs that must not be registered again (the original comment says this
    # avoids duplicating the ASSISTment and junyi entries).
    # NOTE(review): only the first (empty) entry of this list is visible in the
    # source; the remaining entries were lost -- TODO restore them.
    excluded_urls = {''}

    r = requests.get(prefix, timeout=30)
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, "lxml")

    for a in soup.find_all('a'):
        # Name of each dataset as it appears in the listing.
        h = a.get('href')
        # Anchors without an href (or with an empty one) would crash on h[0];
        # entries starting with '.' are skipped as before.
        if not h or h[0] == '.':
            continue

        temp = prefix + h
        # Avoid registering the same URL twice.
        if temp in known_urls or temp in excluded_urls:
            continue

        known_urls.add(temp)
        # Drop the last character of the href to form the dataset key.
        URL_DICT[h[:-1]] = temp

def download_file(url, save_path, override, chunksize=65535):
    """Download ``url`` to ``save_path``, resuming a partial file when possible.

    Parameters
    ----------
    url: str
        Address of the file to fetch.
    save_path: str
        Local path the file is written to.
    override: bool
        If True an existing file at ``save_path`` is downloaded again from
        scratch. If False an existing non-empty file is treated as a partial
        download and a ``Range`` request is sent for the remainder.
    chunksize: int
        Number of bytes requested per iteration of the response stream.

    Returns
    -------
    The value returned by ``decompress(save_path)``.

    Raises
    ------
    FileExistsError
        If the server answers the resume request with status 416.
    """
    downloaded = 0
    resume_tag = False
    local_timestamp = None
    if os.path.exists(save_path):
        if override:
            logger.info(save_path + ' will be overridden.')
        else:
            # Resume download
            downloaded = os.stat(save_path).st_size
            local_timestamp = os.path.getctime(save_path)
            resume_tag = True

    headers = {}
    if downloaded:
        headers['Range'] = 'bytes={}-'.format(downloaded)
        headers['If-Unmodified-Since'] = timestamp2time(local_timestamp)

    logger.info(url + ' is saved as ' + save_path)
    res = requests.get(url, headers=headers, stream=True, timeout=15)
    try:
        mode = 'wb+'
        # Check if server supports range feature, and works as expected.
        if res.status_code == 206:
            # Content range is in format `bytes 327675-43968289/43968290`,
            # check if it starts from where we requested.
            content_range = res.headers.get('content-range')
            if content_range and \
                    int(content_range.split(' ')[-1].split('-')[0]) == downloaded:
                mode = 'ab+'

        elif res.status_code == 416:
            # The requested range cannot be served; callers treat this as
            # "file already present" and skip the download.
            raise FileExistsError(save_path)

        elif res.status_code == 412:
            # The resource was modified after the time given in
            # If-Unmodified-Since, so the partial file is stale.
            logger.warning("Resource Changed, should override. Redownloading...")
            urlretrieve(url, save_path, reporthook=reporthook4urlretrieve)
            # Return here: falling through would overwrite the freshly
            # downloaded file with the body of the 412 response.
            return decompress(save_path)

        if resume_tag:
            logger.info(
                "{} already exists. Send resume request after {}".format(
                    save_path, format_byte_sizeof(downloaded)
                )
            )

        if mode != 'ab+':
            # The file is rewritten from the beginning, so bytes already on
            # disk must not be counted in the progress figures.
            downloaded = 0

        # Content-Length is absent for e.g. chunked responses; fall back to an
        # unknown (zero) total instead of crashing on int(None).
        content_len = res.headers.get('content-length')
        file_origin_size = int(content_len) + downloaded if content_len is not None else 0

        with open(save_path, mode) as f:
            for chunk in res.iter_content(chunksize):
                f.write(chunk)
                downloaded += len(chunk)
                # Guard against an unknown or zero total size.
                percent = downloaded / file_origin_size * 100 if file_origin_size else 0.0
                flush_print('Downloading %s %.2f%%: %s | %s' % (
                    save_path, percent,
                    format_byte_sizeof(downloaded),
                    format_byte_sizeof(file_origin_size),
                ))
    finally:
        # Release the streamed connection on every exit path.
        res.close()

    return decompress(save_path)

def download_data(url, data_dir, override, bloom_filter: set = None):
    """Download a file, or recursively download a directory listing.

    A ``url`` ending in ``"/"`` is treated as a directory: its page is
    fetched and every anchor whose href is non-empty and does not start with
    ``'.'`` is downloaded recursively into a sub-directory of ``data_dir``
    named after the last path component of ``url``. Any other ``url`` is
    treated as a single file and handed to ``download_file``.

    Parameters
    ----------
    url: str
        File or directory address.
    data_dir: str
        Directory the file (or the directory's sub-directory) is placed in.
    override: bool
        Passed through to ``download_file``.
    bloom_filter: set
        URLs already visited during this download; a fresh set is created
        when omitted. It is updated in place.

    Returns
    -------
    str or None
        For a directory, the local directory path; for a file, the value
        returned by ``download_file``; ``None`` if ``url`` was already visited.
    """
    bloom_filter = set() if bloom_filter is None else bloom_filter

    if url in bloom_filter:  # pragma: no cover
        return None
    # Mark as visited before recursing so a listing that links back to an
    # already-seen URL cannot be processed twice.
    bloom_filter.add(url)

    if url.endswith("/"):  # a trailing "/" marks a directory, anything else is a file
        _data_dir = path_append(data_dir, url.split('/')[-2], to_str=True)

        r = requests.get(url, timeout=30)
        r.encoding = r.apparent_encoding
        soup = BeautifulSoup(r.text, "lxml")
        for a in soup.find_all('a'):
            # Link target of this entry.
            h = a.get('href')
            # Skip anchors without an href (h[0] would crash) and entries
            # starting with '.'.
            if not h or h[0] == '.':
                continue
            url_h = url + h
            if url_h not in bloom_filter:
                download_data(url_h, _data_dir, override, bloom_filter)
        return _data_dir

    os.makedirs(data_dir, exist_ok=True)
    save_path = path_append(data_dir, url.split('/')[-1], to_str=True)
    return download_file(url, save_path, override)

def get_data(dataset, data_dir=DEFAULT_DATADIR, override=False, url_dict: dict = None):
    """Download a dataset given its registered name or a direct URL.

    Parameters
    ----------
    dataset: str
        Dataset name (a key of ``url_dict``) or an ``http(s)://`` URL.
    data_dir: str
        Directory the data is stored in.
    override: bool
        Whether to overwrite files that already exist.
    url_dict: dict
        Mapping from dataset name to URL; ``URL_DICT`` is used when this is
        omitted or empty.

    Returns
    -------
    The value returned by ``download_data``; if the download reports that the
    file already exists, the value of ``get_uz_path`` for the expected local
    path instead.

    Raises
    ------
    ValueError
        If ``dataset`` is neither a key of the mapping nor an http(s) URL.
    """
    url_dict = URL_DICT if not url_dict else url_dict

    if dataset in url_dict:
        url = url_dict[dataset]
    elif re.match("http(s?)://.*", dataset):
        url = dataset
    else:
        raise ValueError("%s is neither a valid dataset name nor an url" % dataset)

    save_path = path_append(data_dir, url.split('/')[-1], to_str=True)
    try:
        return download_data(url, data_dir, override)
    except FileExistsError:
        # Raised by the downloader when the server reports nothing is left to
        # fetch; fall back to the path derived from the existing local file.
        logger.info("file existed, skipped")
        return get_uz_path(save_path)
def list_resources():
    """Print every key of ``URL_DICT``, one per line."""
    listing = "\n".join(URL_DICT)
    print(listing)