# download.py

import cgi
import email.utils
import hashlib
import getpass
import mimetypes
import os
import platform
import re
import shutil
import sys
import tempfile

import pip

from pip.backwardcompat import urllib, urlparse, raw_input
from pip.exceptions import InstallationError, HashMismatch
from pip.util import (splitext, rmtree, format_size, display_path,
                      backup_dir, ask_path_exists, unpack_file,
                      create_download_cache_folder, cache_download)
from pip.vcs import vcs
from pip.log import logger
from pip._vendor import requests, six
from pip._vendor.requests.adapters import BaseAdapter
from pip._vendor.requests.auth import AuthBase, HTTPBasicAuth
from pip._vendor.requests.compat import IncompleteRead
from pip._vendor.requests.exceptions import InvalidURL, ChunkedEncodingError
from pip._vendor.requests.models import Response
from pip._vendor.requests.structures import CaseInsensitiveDict


__all__ = ['get_file_content',
           'is_url', 'url_to_path', 'path_to_url',
           'is_archive_file', 'unpack_vcs_link',
           'unpack_file_url', 'is_vcs_url', 'is_file_url', 'unpack_http_url']


def user_agent():
    """Return a string representing the user agent."""
    _implementation = platform.python_implementation()

    if _implementation == 'CPython':
        _implementation_version = platform.python_version()
    elif _implementation == 'PyPy':
        _implementation_version = '%s.%s.%s' % (sys.pypy_version_info.major,
                                                sys.pypy_version_info.minor,
                                                sys.pypy_version_info.micro)
        if sys.pypy_version_info.releaselevel != 'final':
            _implementation_version = ''.join([
                _implementation_version,
                sys.pypy_version_info.releaselevel,
            ])
    elif _implementation == 'Jython':
        _implementation_version = platform.python_version()  # Complete Guess
    elif _implementation == 'IronPython':
        _implementation_version = platform.python_version()  # Complete Guess
    else:
        _implementation_version = 'Unknown'

    try:
        p_system = platform.system()
        p_release = platform.release()
    except IOError:
        p_system = 'Unknown'
        p_release = 'Unknown'

    return " ".join(['pip/%s' % pip.__version__,
                     '%s/%s' % (_implementation, _implementation_version),
                     '%s/%s' % (p_system, p_release)])
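

# A hedged illustration of the resulting header value; the exact components
# depend on the interpreter and OS, so the values below are made up:
#
#     >>> user_agent()
#     'pip/1.5 CPython/2.7.6 Linux/3.13.0'

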
class MultiDomainBasicAuth(AuthBase):

    def __init__(self, prompting=True):
        self.prompting = prompting
        self.passwords = {}

    def __call__(self, req):
        parsed = urlparse.urlparse(req.url)

        # Get the netloc without any embedded credentials
        netloc = parsed.netloc.split("@", 1)[-1]

        # Set the url of the request to the url without any credentials
        req.url = urlparse.urlunparse(parsed[:1] + (netloc,) + parsed[2:])

        # Use any stored credentials that we have for this netloc
        username, password = self.passwords.get(netloc, (None, None))

        # Extract credentials embedded in the url if we have none stored
        if username is None:
            username, password = self.parse_credentials(parsed.netloc)

        if username or password:
            # Store the username and password
            self.passwords[netloc] = (username, password)

            # Send the basic auth with this request
            req = HTTPBasicAuth(username or "", password or "")(req)

        # Attach a hook to handle 401 responses
        req.register_hook("response", self.handle_401)

        return req

    def handle_401(self, resp, **kwargs):
        # We only care about 401 responses; anything else we just pass
        # through as the actual response
        if resp.status_code != 401:
            return resp

        # We are not able to prompt the user, so simply return the response
        if not self.prompting:
            return resp

        parsed = urlparse.urlparse(resp.url)

        # Prompt the user for a new username and password
        username = raw_input("User for %s: " % parsed.netloc)
        password = getpass.getpass("Password: ")

        # Store the new username and password to use for future requests
        if username or password:
            self.passwords[parsed.netloc] = (username, password)

        # Consume content and release the original connection to allow our new
        # request to reuse the same one.
        resp.content
        resp.raw.release_conn()

        # Add our new username and password to the request
        req = HTTPBasicAuth(username or "", password or "")(resp.request)

        # Send our new request
        new_resp = resp.connection.send(req, **kwargs)
        new_resp.history.append(resp)

        return new_resp

    def parse_credentials(self, netloc):
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                return userinfo.split(":", 1)
            return userinfo, None
        return None, None
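

# Behavior sketch for parse_credentials(); the host and credentials are
# placeholders. Note the first case yields a list (it comes straight from
# str.split), while the no-credentials case yields a tuple:
#
#     >>> auth = MultiDomainBasicAuth()
#     >>> auth.parse_credentials("user:s3cret@pypi.example.org")
#     ['user', 's3cret']
#     >>> auth.parse_credentials("pypi.example.org")
#     (None, None)

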
class LocalFSResponse(object):

    def __init__(self, fileobj):
        self.fileobj = fileobj

    def __getattr__(self, name):
        return getattr(self.fileobj, name)

    def read(self, amt=None, decode_content=None, cache_content=False):
        return self.fileobj.read(amt)

    # Insert Hacks to Make Cookie Jar work w/ Requests
    @property
    def _original_response(self):
        class FakeMessage(object):
            def getheaders(self, header):
                return []

            def get_all(self, header, default):
                return []

        class FakeResponse(object):
            @property
            def msg(self):
                return FakeMessage()

        return FakeResponse()


class LocalFSAdapter(BaseAdapter):

    def send(self, request, stream=None, timeout=None, verify=None, cert=None,
             proxies=None):
        parsed_url = urlparse.urlparse(request.url)

        # We only work for requests with a host of localhost
        if parsed_url.netloc.lower() != "localhost":
            raise InvalidURL("Invalid URL %r: Only localhost is allowed" %
                             request.url)

        real_url = urlparse.urlunparse(parsed_url[:1] + ("",) + parsed_url[2:])
        pathname = url_to_path(real_url)

        resp = Response()
        resp.status_code = 200
        resp.url = real_url

        stats = os.stat(pathname)
        modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
        resp.headers = CaseInsensitiveDict({
            "Content-Type": mimetypes.guess_type(pathname)[0] or "text/plain",
            "Content-Length": stats.st_size,
            "Last-Modified": modified,
        })

        resp.raw = LocalFSResponse(open(pathname, "rb"))
        resp.close = resp.raw.close

        return resp

    def close(self):
        pass
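

# Note: a bare file:/// URL has an empty netloc, which would fail the
# "localhost" check in LocalFSAdapter.send(); PipSession.request() below
# rewrites such URLs to file://localhost/... before they reach this adapter.

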
class PipSession(requests.Session):

    timeout = None

    def __init__(self, *args, **kwargs):
        super(PipSession, self).__init__(*args, **kwargs)

        # Attach our User Agent to the request
        self.headers["User-Agent"] = user_agent()

        # Attach our Authentication handler to the session
        self.auth = MultiDomainBasicAuth()

        # Enable file:// urls
        self.mount("file://", LocalFSAdapter())

    def request(self, method, url, *args, **kwargs):
        # Make file:// urls not fail due to lack of a hostname
        parsed = urlparse.urlparse(url)
        if parsed.scheme == "file":
            url = urlparse.urlunparse(parsed[:1] + ("localhost",) + parsed[2:])

        # Allow setting a default timeout on a session
        kwargs.setdefault("timeout", self.timeout)

        # Dispatch the actual request
        return super(PipSession, self).request(method, url, *args, **kwargs)
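

# A minimal usage sketch; the URL and timeout value are illustrative, not pip
# defaults:
#
#     session = PipSession()
#     session.timeout = 15  # seconds; request() applies it via setdefault()
#     resp = session.get("https://pypi.example.org/simple/")
#     resp.raise_for_status()

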
def get_file_content(url, comes_from=None, session=None):
    """Gets the content of a file; it may be a filename, file: URL, or
    http: URL.  Returns (location, content).  Content is unicode."""
    if session is None:
        session = PipSession()

    match = _scheme_re.search(url)
    if match:
        scheme = match.group(1).lower()
        if (scheme == 'file' and comes_from
                and comes_from.startswith('http')):
            raise InstallationError(
                'Requirements file %s references URL %s, which is local'
                % (comes_from, url))
        if scheme == 'file':
            path = url.split(':', 1)[1]
            path = path.replace('\\', '/')
            match = _url_slash_drive_re.match(path)
            if match:
                path = match.group(1) + ':' + path.split('|', 1)[1]
            path = urllib.unquote(path)
            if path.startswith('/'):
                path = '/' + path.lstrip('/')
            url = path
        else:
            ## FIXME: catch some errors
            resp = session.get(url)
            resp.raise_for_status()

            if six.PY3:
                return resp.url, resp.text
            else:
                return resp.url, resp.content
    try:
        f = open(url)
        content = f.read()
    except IOError:
        e = sys.exc_info()[1]
        raise InstallationError('Could not open requirements file: %s' %
                                str(e))
    else:
        f.close()
    return url, content
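

# Usage sketch (paths and URLs are placeholders): local paths and file: URLs
# are read from disk, while http(s) URLs are fetched through the session.
#
#     location, content = get_file_content("requirements.txt")
#     location, content = get_file_content(
#         "https://example.com/requirements.txt", session=PipSession())

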
_scheme_re = re.compile(r'^(http|https|file):', re.I)
_url_slash_drive_re = re.compile(r'/*([a-z])\|', re.I)


def is_url(name):
    """Returns true if the name looks like a URL"""
    if ':' not in name:
        return False
    scheme = name.split(':', 1)[0].lower()
    return scheme in ['http', 'https', 'file', 'ftp'] + vcs.all_schemes
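

# Illustrative results; vcs.all_schemes also admits VCS schemes such as
# "git+https" (the URL below is a placeholder):
#
#     >>> is_url('http://pypi.example.org/simple/')
#     True
#     >>> is_url('pip-1.5.tar.gz')
#     False

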
def url_to_path(url):
    """
    Convert a file: URL to a path.
    """
    assert url.startswith('file:'), (
        "You can only turn file: urls into filenames (not %r)" % url)
    path = url[len('file:'):].lstrip('/')
    path = urllib.unquote(path)
    if _url_drive_re.match(path):
        path = path[0] + ':' + path[2:]
    else:
        path = '/' + path
    return path
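

# Behavior sketch; Windows drive letters are recognized via _url_drive_re
# below (the paths are placeholders):
#
#     >>> url_to_path('file:///tmp/pip%20build')
#     '/tmp/pip build'
#     >>> url_to_path('file:///C:/Python27/')
#     'C:/Python27/'

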
_drive_re = re.compile('^([a-z]):', re.I)
_url_drive_re = re.compile('^([a-z])[:|]', re.I)


def path_to_url(path):
    """
    Convert a path to a file: URL.  The path will be made absolute and have
    quoted path parts.
    """
    path = os.path.normpath(os.path.abspath(path))
    drive, path = os.path.splitdrive(path)
    filepath = path.split(os.path.sep)
    url = '/'.join([urllib.quote(part) for part in filepath])
    if not drive:
        url = url.lstrip('/')
    return 'file:///' + drive + url
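

# Roundtrip sketch on a POSIX system (the path is a placeholder):
#
#     >>> path_to_url('/tmp/pip build')
#     'file:///tmp/pip%20build'
#     >>> url_to_path(path_to_url('/tmp/pip build'))
#     '/tmp/pip build'

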
def is_archive_file(name):
    """Return True if `name` is considered an archive file."""
    archives = ('.zip', '.tar.gz', '.tar.bz2', '.tgz', '.tar', '.pybundle',
                '.whl')
    ext = splitext(name)[1].lower()
    if ext in archives:
        return True
    return False
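

# Illustrative checks; pip.util.splitext treats '.tar.gz' as one extension,
# unlike os.path.splitext:
#
#     >>> is_archive_file('pip-1.5.tar.gz')
#     True
#     >>> is_archive_file('pip-1.5')
#     False

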
def unpack_vcs_link(link, location, only_download=False):
    vcs_backend = _get_used_vcs_backend(link)
    if only_download:
        vcs_backend.export(location)
    else:
        vcs_backend.unpack(location)


def _get_used_vcs_backend(link):
    for backend in vcs.backends:
        if link.scheme in backend.schemes:
            vcs_backend = backend(link.url)
            return vcs_backend


def is_vcs_url(link):
    return bool(_get_used_vcs_backend(link))


def is_file_url(link):
    return link.url.lower().startswith('file:')


def _check_hash(download_hash, link):
    if download_hash.digest_size != hashlib.new(link.hash_name).digest_size:
        logger.fatal("Hash digest size of the package %d (%s) doesn't match "
                     "the expected hash name %s!"
                     % (download_hash.digest_size, link, link.hash_name))
        raise HashMismatch('Hash name mismatch for package %s' % link)
    if download_hash.hexdigest() != link.hash:
        logger.fatal("Hash of the package %s (%s) doesn't match the expected "
                     "hash %s!" % (link, download_hash.hexdigest(), link.hash))
        raise HashMismatch('Bad %s hash for package %s' %
                           (link.hash_name, link))


def _get_hash_from_file(target_file, link):
    try:
        download_hash = hashlib.new(link.hash_name)
    except (ValueError, TypeError):
        logger.warn("Unsupported hash name %s for package %s" %
                    (link.hash_name, link))
        return None

    fp = open(target_file, 'rb')
    while True:
        chunk = fp.read(4096)
        if not chunk:
            break
        download_hash.update(chunk)
    fp.close()
    return download_hash
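

# The chunked-update pattern above is equivalent to this standalone sketch
# (hash name and filename are placeholders):
#
#     import hashlib
#     h = hashlib.new('sha256')
#     with open('pip-1.5.tar.gz', 'rb') as fp:
#         for chunk in iter(lambda: fp.read(4096), b''):
#             h.update(chunk)
#     digest = h.hexdigest()

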
def _download_url(resp, link, temp_location):
    fp = open(temp_location, 'wb')
    download_hash = None
    if link.hash and link.hash_name:
        try:
            download_hash = hashlib.new(link.hash_name)
        except ValueError:
            logger.warn("Unsupported hash name %s for package %s" %
                        (link.hash_name, link))
    try:
        total_length = int(resp.headers['content-length'])
    except (ValueError, KeyError, TypeError):
        total_length = 0
    downloaded = 0
    show_progress = total_length > 40 * 1000 or not total_length
    show_url = link.show_url
    try:
        if show_progress:
            ## FIXME: the URL can get really long in this message:
            if total_length:
                logger.start_progress('Downloading %s (%s): ' %
                                      (show_url, format_size(total_length)))
            else:
                logger.start_progress('Downloading %s (unknown size): ' %
                                      show_url)
        else:
            logger.notify('Downloading %s' % show_url)
        logger.info('Downloading from URL %s' % link)

        def resp_read(chunk_size):
            try:
                # Special case for urllib3.
                try:
                    for chunk in resp.raw.stream(chunk_size,
                                                 decode_content=False):
                        yield chunk
                except IncompleteRead as e:
                    raise ChunkedEncodingError(e)
            except AttributeError:
                # Standard file-like object.
                while True:
                    chunk = resp.raw.read(chunk_size)
                    if not chunk:
                        break
                    yield chunk

        for chunk in resp_read(4096):
            downloaded += len(chunk)
            if show_progress:
                if not total_length:
                    logger.show_progress('%s' % format_size(downloaded))
                else:
                    logger.show_progress('%3i%% %s' %
                                         (100 * downloaded / total_length,
                                          format_size(downloaded)))
            if download_hash is not None:
                download_hash.update(chunk)
            fp.write(chunk)
        fp.close()
    finally:
        if show_progress:
            logger.end_progress('%s downloaded' % format_size(downloaded))
    return download_hash


def _copy_file(filename, location, content_type, link):
    copy = True
    download_location = os.path.join(location, link.filename)
    if os.path.exists(download_location):
        response = ask_path_exists(
            'The file %s exists. (i)gnore, (w)ipe, (b)ackup ' %
            display_path(download_location), ('i', 'w', 'b'))
        if response == 'i':
            copy = False
        elif response == 'w':
            logger.warn('Deleting %s' % display_path(download_location))
            os.remove(download_location)
        elif response == 'b':
            dest_file = backup_dir(download_location)
            logger.warn('Backing up %s to %s'
                        % (display_path(download_location),
                           display_path(dest_file)))
            shutil.move(download_location, dest_file)
    if copy:
        shutil.copy(filename, download_location)
        logger.notify('Saved %s' % display_path(download_location))


def unpack_http_url(link, location, download_cache, download_dir=None,
                    session=None):
    if session is None:
        session = PipSession()

    temp_dir = tempfile.mkdtemp('-unpack', 'pip-')
    temp_location = None
    target_url = link.url.split('#', 1)[0]

    already_cached = False
    cache_file = None
    cache_content_type_file = None
    download_hash = None

    # If a download cache is specified, is the file cached there?
    if download_cache:
        cache_file = os.path.join(download_cache,
                                  urllib.quote(target_url, ''))
        cache_content_type_file = cache_file + '.content-type'
        already_cached = (
            os.path.exists(cache_file) and
            os.path.exists(cache_content_type_file)
        )
        if not os.path.isdir(download_cache):
            create_download_cache_folder(download_cache)

    # If a download dir is specified, is the file already downloaded there?
    already_downloaded = None
    if download_dir:
        already_downloaded = os.path.join(download_dir, link.filename)
        if not os.path.exists(already_downloaded):
            already_downloaded = None

    # If already downloaded, does its hash match?
    if already_downloaded:
        temp_location = already_downloaded
        content_type = mimetypes.guess_type(already_downloaded)[0]
        logger.notify('File was already downloaded %s' % already_downloaded)
        if link.hash:
            download_hash = _get_hash_from_file(temp_location, link)
            try:
                _check_hash(download_hash, link)
            except HashMismatch:
                logger.warn(
                    'Previously-downloaded file %s has bad hash, '
                    're-downloading.' % temp_location
                )
                temp_location = None
                os.unlink(already_downloaded)
                already_downloaded = None

    # If not a valid download, let's confirm the cached file is valid
    if already_cached and not temp_location:
        with open(cache_content_type_file) as fp:
            content_type = fp.read().strip()
        temp_location = cache_file
        logger.notify('Using download cache from %s' % cache_file)
        if link.hash and link.hash_name:
            download_hash = _get_hash_from_file(cache_file, link)
            try:
                _check_hash(download_hash, link)
            except HashMismatch:
                logger.warn(
                    'Cached file %s has bad hash, '
                    're-downloading.' % temp_location
                )
                temp_location = None
                os.unlink(cache_file)
                os.unlink(cache_content_type_file)
                already_cached = False

    # We don't have either a cached or a downloaded copy,
    # so let's download to a tmp dir
    if not temp_location:
        try:
            resp = session.get(target_url, stream=True)
            resp.raise_for_status()
        except requests.HTTPError as exc:
            logger.fatal("HTTP error %s while getting %s" %
                         (exc.response.status_code, link))
            raise

        content_type = resp.headers.get('content-type', '')
        filename = link.filename  # fallback
        # Have a look at the Content-Disposition header for a better guess
        content_disposition = resp.headers.get('content-disposition')
        if content_disposition:
            type, params = cgi.parse_header(content_disposition)
            # We use ``or`` here because we don't want to use an "empty" value
            # from the filename param.
            filename = params.get('filename') or filename
        ext = splitext(filename)[1]
        if not ext:
            ext = mimetypes.guess_extension(content_type)
            if ext:
                filename += ext
        if not ext and link.url != resp.url:
            ext = os.path.splitext(resp.url)[1]
            if ext:
                filename += ext
        temp_location = os.path.join(temp_dir, filename)
        download_hash = _download_url(resp, link, temp_location)
        if link.hash and link.hash_name:
            _check_hash(download_hash, link)

    # a download dir is specified; let's copy the archive there
    if download_dir and not already_downloaded:
        _copy_file(temp_location, download_dir, content_type, link)

    # unpack the archive to the build dir location. even when only downloading
    # archives, they have to be unpacked to parse dependencies
    unpack_file(temp_location, location, content_type, link)

    # if using a download cache, cache it, if needed
    if cache_file and not already_cached:
        cache_download(cache_file, temp_location, content_type)

    if not (already_cached or already_downloaded):
        os.unlink(temp_location)

    os.rmdir(temp_dir)


def unpack_file_url(link, location, download_dir=None):

    link_path = url_to_path(link.url_without_fragment)
    already_downloaded = False

    # If it's a url to a local directory
    if os.path.isdir(link_path):
        if os.path.isdir(location):
            rmtree(location)
        shutil.copytree(link_path, location, symlinks=True)
        return

    # if link has a hash, let's confirm it matches
    if link.hash:
        link_path_hash = _get_hash_from_file(link_path, link)
        _check_hash(link_path_hash, link)

    # If a download dir is specified, is the file already there and valid?
    if download_dir:
        download_path = os.path.join(download_dir, link.filename)
        if os.path.exists(download_path):
            content_type = mimetypes.guess_type(download_path)[0]
            logger.notify('File was already downloaded %s' % download_path)
            if link.hash:
                download_hash = _get_hash_from_file(download_path, link)
                try:
                    _check_hash(download_hash, link)
                    already_downloaded = True
                except HashMismatch:
                    logger.warn(
                        'Previously-downloaded file %s has bad hash, '
                        're-downloading.' % link_path
                    )
                    os.unlink(download_path)
            else:
                already_downloaded = True

    if already_downloaded:
        from_path = download_path
    else:
        from_path = link_path

    content_type = mimetypes.guess_type(from_path)[0]

    # unpack the archive to the build dir location. even when only downloading
    # archives, they have to be unpacked to parse dependencies
    unpack_file(from_path, location, content_type, link)

    # a download dir is specified and not already downloaded
    if download_dir and not already_downloaded:
        _copy_file(from_path, download_dir, content_type, link)