From c85655f7209ad1ec30d6680f88ea826b08a98e83 Mon Sep 17 00:00:00 2001 From: Tin Tvrtkovic Date: Sat, 2 Nov 2013 17:23:59 +0100 Subject: [PATCH] Modified the get_url module to respect the content-disposition header if the destination is a directory and the server provides it. See http://www.w3.org/Protocols/rfc2616/rfc2616-sec19.html, section 19.5.1. --- library/network/get_url | 74 +++++++++++++++++++++++++++++++---------- 1 file changed, 56 insertions(+), 18 deletions(-) diff --git a/library/network/get_url b/library/network/get_url index 35d724febe..58b1eb16aa 100644 --- a/library/network/get_url +++ b/library/network/get_url @@ -49,15 +49,20 @@ options: dest: description: - absolute path of where to download the file to. - - If I(dest) is a directory, the basename of the file on the remote server will be used. If a directory, C(force=yes) must also be set. + - If C(dest) is a directory, either the server-provided filename or, if + none is provided, the base name of the URL on the remote server will be + used. If C(dest) is a directory, C(force) has no effect. required: true default: null force: description: - - If C(yes), will download the file every time and replace the - file if the contents change. If C(no), the file will only be downloaded if - the destination does not exist. Generally should be C(yes) only for small - local files. Prior to 0.6, this module behaved as if C(yes) was the default. + - If C(yes) and C(dest) is not a directory, will download the file every + time and replace the file if the contents change. If C(no), the file + will only be downloaded if the destination does not exist. Generally + should be C(yes) only for small local files. Prior to 0.6, this module + behaved as if C(yes) was the default. + Has no effect if C(dest) is a directory - the file will always be + downloaded, but replaced only if the contents changed. 
version_added: "0.7" required: false choices: [ "yes", "no" ] @@ -125,7 +130,7 @@ def url_filename(url): return 'index.html' return fn -def url_do_get(module, url, dest, use_proxy): +def url_do_get(module, url, dest, use_proxy, last_mod_time): """ Get url and return request and info Credits: http://stackoverflow.com/questions/7006574/how-to-download-file-from-ftp @@ -171,9 +176,8 @@ def url_do_get(module, url, dest, use_proxy): request = urllib2.Request(url) request.add_header('User-agent', USERAGENT) - if os.path.exists(dest) and not module.params['force']: - t = datetime.datetime.utcfromtimestamp(os.path.getmtime(dest)) - tstamp = t.strftime('%a, %d %b %Y %H:%M:%S +0000') + if last_mod_time: + tstamp = last_mod_time.strftime('%a, %d %b %Y %H:%M:%S +0000') request.add_header('If-Modified-Since', tstamp) try: @@ -190,14 +194,14 @@ def url_do_get(module, url, dest, use_proxy): return r, info -def url_get(module, url, dest, use_proxy): +def url_get(module, url, dest, use_proxy, last_mod_time): """ - Download url and store at dest. - If dest is a directory, determine filename from url. + Download data from the url and store in a temporary file. + Return (tempfile, info about the request) """ - req, info = url_do_get(module, url, dest, use_proxy) + req, info = url_do_get(module, url, dest, use_proxy, last_mod_time) # TODO: should really handle 304, but how? src file could exist (and be newer) but empty if info['status'] == 304: @@ -218,6 +222,25 @@ def url_get(module, url, dest, use_proxy): req.close() return tempname, info +def extract_filename_from_headers(headers): + """ + Extracts a filename from the given dict of HTTP headers. + + Looks for the content-disposition header and applies a regex. 
+ Returns the filename if successful, else None.""" + cont_disp_regex = 'attachment; ?filename="(.+)"' + res = None + + if 'content-disposition' in headers: + cont_disp = headers['content-disposition'] + match = re.match(cont_disp_regex, cont_disp) + if match: + res = match.group(1) + # Try preventing any funny business. + res = os.path.basename(res) + + return res + # ============================================================== # main @@ -247,15 +270,30 @@ def main(): sha256sum = module.params['sha256sum'] use_proxy = module.params['use_proxy'] - if os.path.isdir(dest): - dest = os.path.join(dest, url_filename(url)) + dest_is_dir = os.path.isdir(dest) + last_mod_time = None - if not force: - if os.path.exists(dest): + if not dest_is_dir and os.path.exists(dest): + if not force: module.exit_json(msg="file already exists", dest=dest, url=url, changed=False) + # If the file already exists, prepare the last modified time for the + # request. + mtime = os.path.getmtime(dest) + last_mod_time = datetime.datetime.utcfromtimestamp(mtime) + # download to tmpsrc - tmpsrc, info = url_get(module, url, dest, use_proxy) + tmpsrc, info = url_get(module, url, dest, use_proxy, last_mod_time) + + # Now the request has completed, we can finally generate the final + # destination file name from the info dict. + if dest_is_dir: + filename = extract_filename_from_headers(info) + if not filename: + # Fall back to extracting the filename from the URL. + filename = url_filename(url) + dest = os.path.join(dest, filename) + md5sum_src = None md5sum_dest = None