From c85655f7209ad1ec30d6680f88ea826b08a98e83 Mon Sep 17 00:00:00 2001
From: Tin Tvrtkovic <tinchester@gmail.com>
Date: Sat, 2 Nov 2013 17:23:59 +0100
Subject: [PATCH] Modified the get_url module to respect the
 content-disposition header if the destination is a directory and the server
 provides it. See http://www.w3.org/Protocols/rfc2616/rfc2616-sec19.html,
 section 19.5.1.

---
 library/network/get_url | 74 +++++++++++++++++++++++++++++++----------
 1 file changed, 56 insertions(+), 18 deletions(-)

diff --git a/library/network/get_url b/library/network/get_url
index 35d724febe..58b1eb16aa 100644
--- a/library/network/get_url
+++ b/library/network/get_url
@@ -49,15 +49,20 @@ options:
   dest:
     description:
       - absolute path of where to download the file to.
-      - If I(dest) is a directory, the basename of the file on the remote server will be used. If a directory, C(force=yes) must also be set.
+      - If C(dest) is a directory, either the server-provided filename or, if
+        none is provided, the base name of the URL on the remote server will
+        be used. If C(dest) is a directory, C(force) has no effect.
     required: true
     default: null
   force:
     description:
-      - If C(yes), will download the file every time and replace the
-        file if the contents change. If C(no), the file will only be downloaded if
-        the destination does not exist. Generally should be C(yes) only for small
-        local files. Prior to 0.6, this module behaved as if C(yes) was the default.
+      - If C(yes) and C(dest) is not a directory, will download the file every
+        time and replace the file if the contents change. If C(no), the file
+        will only be downloaded if the destination does not exist. Generally
+        should be C(yes) only for small local files. Prior to 0.6, this module
+        behaved as if C(yes) was the default.
+        Has no effect if C(dest) is a directory - the file will always be
+        downloaded, but replaced only if the contents changed.
     version_added: "0.7"
     required: false
     choices: [ "yes", "no" ]
@@ -125,7 +130,7 @@ def url_filename(url):
         return 'index.html'
     return fn
 
-def url_do_get(module, url, dest, use_proxy):
+def url_do_get(module, url, dest, use_proxy, last_mod_time):
     """
     Get url and return request and info
     Credits: http://stackoverflow.com/questions/7006574/how-to-download-file-from-ftp
@@ -171,9 +176,8 @@ def url_do_get(module, url, dest, use_proxy):
     request = urllib2.Request(url)
     request.add_header('User-agent', USERAGENT)
 
-    if os.path.exists(dest) and not module.params['force']:
-        t = datetime.datetime.utcfromtimestamp(os.path.getmtime(dest))
-        tstamp = t.strftime('%a, %d %b %Y %H:%M:%S +0000')
+    if last_mod_time:
+        tstamp = last_mod_time.strftime('%a, %d %b %Y %H:%M:%S +0000')
         request.add_header('If-Modified-Since', tstamp)
 
     try:
@@ -190,14 +194,14 @@ def url_do_get(module, url, dest, use_proxy):
 
     return r, info
 
-def url_get(module, url, dest, use_proxy):
+def url_get(module, url, dest, use_proxy, last_mod_time):
     """
-    Download url and store at dest.
-    If dest is a directory, determine filename from url.
+    Download data from the url and store in a temporary file.
+
     Return (tempfile, info about the request)
     """
 
-    req, info = url_do_get(module, url, dest, use_proxy)
+    req, info = url_do_get(module, url, dest, use_proxy, last_mod_time)
 
     # TODO: should really handle 304, but how? src file could exist (and be newer) but empty
     if info['status'] == 304:
@@ -218,6 +222,25 @@ def url_get(module, url, dest, use_proxy):
     req.close()
     return tempname, info
 
+def extract_filename_from_headers(headers):
+    """
+    Extracts a filename from the given dict of HTTP headers.
+
+    Looks up the 'content-disposition' key and matches a quoted filename.
+    Returns the base name of that filename if successful, else None."""
+    # [^"]+ stops at the first closing quote, so trailing parameters such as
+    # '; size="10"' are not swallowed into the filename.
+    cont_disp_regex = r'attachment; ?filename="([^"]+)"'
+    res = None
+
+    match = re.match(cont_disp_regex, headers.get('content-disposition', ''))
+    if match:
+        # Try preventing any funny business: drop any directory components
+        # the server may have put in the name.
+        res = os.path.basename(match.group(1))
+
+    return res
+
 # ==============================================================
 # main
 
@@ -247,15 +270,30 @@ def main():
     sha256sum = module.params['sha256sum']
     use_proxy = module.params['use_proxy']
 
-    if os.path.isdir(dest):
-        dest = os.path.join(dest, url_filename(url))
+    dest_is_dir = os.path.isdir(dest)
+    last_mod_time = None
 
-    if not force:
-        if os.path.exists(dest):
+    if not dest_is_dir and os.path.exists(dest):
+        if not force:
             module.exit_json(msg="file already exists", dest=dest, url=url, changed=False)
 
+        # If the file already exists, prepare the last modified time for the
+        # request.
+        mtime = os.path.getmtime(dest)
+        last_mod_time = datetime.datetime.utcfromtimestamp(mtime)
+
     # download to tmpsrc
-    tmpsrc, info = url_get(module, url, dest, use_proxy)
+    tmpsrc, info = url_get(module, url, dest, use_proxy, last_mod_time)
+
+    # Now the request has completed, we can finally generate the final
+    # destination file name from the info dict.
+    if dest_is_dir:
+        filename = extract_filename_from_headers(info)
+        if not filename:
+            # Fall back to extracting the filename from the URL.
+            filename = url_filename(url)
+        dest = os.path.join(dest, filename)
+
     md5sum_src   = None
     md5sum_dest  = None