Source code for downloader

#
# downloader.py
#
# Copyright (c) 2015-2016 Junpei Kawamoto
#
# This software is released under the MIT License.
#
# http://opensource.org/licenses/mit-license.php
#
""" Download objects from several services.
"""
import contextlib
import gzip
import io
import logging
import os
import re
import shutil
import subprocess
import sys
import urllib2
import urlparse


LOGGER = logging.getLogger(__name__)


def _open_url(url):
    """ Open a URL by urllib2 via gzip encoding.

    Args:
      url: URL string.

    Returns:
      Response object returned by urllib2.urlopen.
    """
    req = urllib2.Request(url)
    req.add_header("Accept-encoding", "gzip")
    return urllib2.urlopen(req)


def _copy_response(res, dest):
    """ Copy a response opened by urllib2 to a destination.

    Args:
      res: Response made by urllib2.urlopen.
      dest: Destination path.
    """
    if res.info().get("Content-Encoding") == "gzip":
        res = gzip.GzipFile(fileobj=io.BytesIO(res.read()))

    with open(dest, "wb") as fp:
        shutil.copyfileobj(res, fp)


[docs]def curl(url, dest): """ Download an object using curl. Args: url: Parsed URL specifying an object. dest: Destination path. Returns: Path for the downloaded file. """ with contextlib.closing(_open_url(urlparse.urlunparse(url))) as res: _copy_response(res, dest) return dest
[docs]def gsutil(url, dest): """ Download an object using gsutil. Args: url: Parsed URL specifying an object. dest: Destination path. Returns: Path for the downloaded file. """ p = subprocess.Popen( ["gsutil", "cp", urlparse.urlunparse(url), dest], stdout=sys.stdout, stderr=sys.stderr) p.wait() return dest
[docs]def dropbox(url, dest): """ Download an object from dropbox. Args: url: Parsed URL specifying an object. dest: Destination path. Returns: Downloaded filename. """ new_url = "https://{host}{path}?dl=1".format( host=url.netloc, path=url.path) with contextlib.closing(_open_url(new_url)) as res: disposition = res.info().getheader("content-disposition") match = re.search("filename=\"(.*)\";", disposition) if match and match.group(1).endswith(".zip"): dest += ".zip" _copy_response(res, dest) return dest
[docs]def download(url, unzip=True): """ Download an object specified by a url. Url can have a destination path. The format is - scheme://host/path - scheme://host/path:dest where dest is the destination path. Scheme is one of http, https, gs, dropbox. Args: url: An extended url specifying the url of an object and an destination path. unzip: If True and the object specified by the url is zipped, unzip them. """ # Check the url contains a destination path. dest = "." if url.find(":") != url.rfind(":"): dest = url[url.rfind(":") + 1:] url = url[:url.rfind(":")] # Parse the URL. purl = urlparse.urlparse(url) # If the destination path is a directory, use filename as same as URL. if os.path.isdir(dest) or dest[:-1] == "/": dest = os.path.join(dest, purl.path[purl.path.rfind("/") + 1:]) # Choose downloader. downloader = curl if purl.scheme == "gs": downloader = gsutil elif purl.scheme == "dropbox": downloader = dropbox LOGGER.info("Downloading %s to %s", url, dest) res = downloader(purl, dest) if unzip and res.endswith(".zip"): # If donloaded file is a zip, unzip and remove it. LOGGER.info("Unzipping %s", res) p = subprocess.Popen( ["unzip", "-o", "-d", os.path.dirname(res), res], stdout=sys.stdout, stderr=sys.stderr) p.communicate() os.remove(res) elif res.endswith(".tar.gz"): LOGGER.info("Unpacking %s", res) p = subprocess.Popen( ["tar", "-zxvf", res], cwd=os.path.dirname(res), stdout=sys.stdout, stderr=sys.stderr) p.communicate() os.remove(res) elif res.endswith(".tar"): LOGGER.info("Unpacking %s", res) p = subprocess.Popen( ["tar", "-xvf", res], cwd=os.path.dirname(res), stdout=sys.stdout, stderr=sys.stderr) p.communicate() os.remove(res)