url.py 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254
  1. import time
  2. import threading
  3. import httplib
  4. import urllib
  5. import urllib2
  6. import getpass
  7. import re
  8. import os
  9. import sys
  10. import cookielib
  11. import zlib
  12. import gzip
  13. import socket
  14. import ssl
  15. from cStringIO import StringIO
  16. import multipart
  17. BLOCK_SIZE = 64 * 1024
def decode (page):
    """gunzip or deflate a compressed page

    Inspects the response's Content-Encoding header; if it is one of
    gzip/x-gzip/deflate, reads the whole body, wraps a decompressing
    file object around it and returns a new addinfourl response with
    the Content-Encoding header removed.  Uncompressed responses are
    returned unchanged.

    @param page: response object as returned by urllib2 (addinfourl-like:
        has read(), info(), geturl(); may have code/msg)
    @return: a response object whose read() yields decompressed data
    """
    encoding = page.info().get("Content-Encoding")
    if encoding in ('gzip', 'x-gzip', 'deflate'):
        # cannot seek in socket descriptors, so must get content now
        content = page.read()
        if encoding == 'deflate':
            # deflate = raw zlib stream; decompress fully in memory
            fp = StringIO(zlib.decompress(content))
        else:
            # gzip/x-gzip: lazy decompression via a GzipFile wrapper
            fp = gzip.GzipFile('', 'rb', 9, StringIO(content))
        # remove content-encoding header by rebuilding the header set
        # into a fresh (empty) HTTPMessage, copying everything else over
        headers = httplib.HTTPMessage(StringIO(""))
        ceheader = re.compile(r"(?i)content-encoding:")
        for h in page.info().keys():
            if not ceheader.match(h):
                headers[h] = page.info()[h]
        newpage = urllib.addinfourl(fp, headers, page.geturl())
        # Propagate code, msg through
        if hasattr(page, 'code'):
            newpage.code = page.code
        if hasattr(page, 'msg'):
            newpage.msg = page.msg
        return newpage
    return page
  43. class HttpWithGzipHandler(urllib2.HTTPHandler):
  44. """http with gzip encoding
  45. """
  46. def http_open (self, req):
  47. return decode(urllib2.HTTPHandler.http_open(self, req))
  48. class HttpsWithGzipHandler(urllib2.HTTPSHandler):
  49. """https with gzip encoding
  50. """
  51. def https_open (self, req):
  52. return decode(urllib2.HTTPSHandler.https_open(self, req))
  53. class handlepasswd(urllib2.HTTPPasswordMgrWithDefaultRealm):
  54. def find_user_password(self, realm, authurl):
  55. user, password = urllib2.HTTPPasswordMgrWithDefaultRealm.find_user_password(self, realm, authurl)
  56. if user is not None:
  57. return user, password
  58. user = raw_input('Enter username for %s at %s: ' % (realm, authurl))
  59. password = getpass.getpass(
  60. "Enter password for %s in %s at %s: " % (user, realm, authurl))
  61. self.add_password(realm, authurl, user, password)
  62. return user, password
  63. def _printProgress(read, max, kbcurr, kbtotal):
  64. print '%.2f of %.2f MB [%.1f%%] downloaded [%.2f kb sec]' % (read / 1024.0 ** 2, max / 1024.0 ** 2, 100.0 * read / max, kbtotal)
  65. def readWithProgress(orgOpenFunc):
  66. epsilon = 0.00000001
  67. def _inner(*args, **kwargs):
  68. '''returns response data as string if no "file" kwargs is given else
  69. write response data to file
  70. @param file: filelike that is used with write()
  71. @type file: filelike
  72. @param cb: callback functiton with signature (currentCount, maxCount, kbsecCurrent, kbsecTotal)
  73. @type cp: callable
  74. @param cbcount: how many times callback should be triggered
  75. @type cbcount: int
  76. '''
  77. block_size = BLOCK_SIZE
  78. request = orgOpenFunc(*args)
  79. progressCallback = kwargs.get('cb', _printProgress)
  80. #make 10 callbacks default
  81. callbackCount = kwargs.get('cbcount', 10)
  82. callbackStepWide = 0
  83. callbackStep = 0
  84. data = kwargs.get('file', StringIO())
  85. headers = request.info()
  86. max = int(headers.get('Content-Length', -1))
  87. if max > 0:
  88. callbackStep = callbackStepWide = float(max) / callbackCount
  89. read = 0
  90. start = time.time()
  91. while True:
  92. t = time.time()
  93. s = request.read(block_size)
  94. kbsecCurrent = block_size / (time.time() - t + epsilon) / 1024.0
  95. if s == '':
  96. break
  97. data.write(s)
  98. read += block_size
  99. kbsecTotal = read / (time.time() - start + epsilon) / 1024.0
  100. if callbackStepWide > 0:
  101. if read > callbackStep:
  102. progressCallback(read, max, kbsecCurrent, kbsecTotal)
  103. callbackStep += callbackStepWide
  104. else:
  105. progressCallback(read, max, kbsecCurrent, kbsecTotal)
  106. request.close()
  107. if max >= 0 and read < max:
  108. raise Exception("retrieval incomplete: got only %i out "
  109. "of %i bytes" % (read, max))
  110. if not 'file' in kwargs:
  111. return data.getvalue()
  112. return _inner
  113. def getOpener(proxies=None, authfunc=None, enableGzip=False,
  114. headers=None, cookieInfo=None,
  115. enableCookies=True, enableMultipart=False,
  116. ):
  117. pwd_manager = handlepasswd()
  118. handlers = [
  119. urllib2.UnknownHandler(),
  120. urllib2.HTTPBasicAuthHandler(pwd_manager),
  121. urllib2.ProxyBasicAuthHandler(pwd_manager),
  122. urllib2.HTTPDigestAuthHandler(pwd_manager),
  123. urllib2.ProxyDigestAuthHandler(pwd_manager),
  124. urllib2.HTTPDefaultErrorHandler(),
  125. urllib2.HTTPRedirectHandler(),
  126. ]
  127. if proxies == 'auto':
  128. proxies = urllib.getproxies()
  129. if proxies is not None:
  130. # empty values like {'http': ''} will lead to URLError: <urlopen error no host given>
  131. if '' in proxies.values():
  132. for k in proxies.keys():
  133. if proxies[k] == '':
  134. proxies.pop(k)
  135. proxyHandler = urllib2.ProxyHandler(proxies)
  136. else:
  137. #disable all proxies - if we would pass None, the ProxyHandler would autoetect Proxies
  138. proxyHandler = urllib2.ProxyHandler({})
  139. handlers.append(proxyHandler)
  140. if enableMultipart:
  141. handlers.append(multipart.MultipartPostHandler)
  142. if enableCookies:
  143. cj = cookielib.CookieJar()
  144. if cookieInfo is not None:
  145. cj.set_cookie(cookielib.Cookie(
  146. cookieInfo.get('version', None), cookieInfo['name'], cookieInfo['value'],
  147. cookieInfo.get('port', None), None,
  148. cookieInfo['domain'], None, None,
  149. cookieInfo.get('path', '/'), None,
  150. cookieInfo.get('secure', False),
  151. cookieInfo.get('expires', None),
  152. False,
  153. '',
  154. '',
  155. {}))
  156. handlers.append(urllib2.HTTPCookieProcessor(cj))
  157. if enableGzip:
  158. handlers.append(HttpWithGzipHandler())
  159. opener = urllib2.build_opener(*handlers)
  160. if headers is None:
  161. headers = dict(opener.addheaders)
  162. else:
  163. tmp = dict(opener.addheaders)
  164. tmp.update(headers)
  165. if enableGzip:
  166. headers['Accept-Encoding'] = 'gzip;q=1.0, deflate;q=0.9, identity;q=0.5'
  167. #add an additional function
  168. opener.readWithProgress = readWithProgress(opener.open)
  169. opener.addheaders = headers.items()
  170. if authfunc is not None:
  171. authfunc(opener)
  172. # print _global_opener.handlers
  173. #~ urllib2.install_opener(_global_opener)
  174. return opener
  175. def urlopen(url, proxies=None, data=None, authfunc=None, enableGzip=True, useragent=None):
  176. ''' create a new opener, open request object and return an addinfourl object with an open filepointer (use .read())'''
  177. headers = {}
  178. if enableGzip:
  179. headers['Accept-Encoding'] = 'gzip;q=1.0, deflate;q=0.9, identity;q=0.5'
  180. if useragent is not None:
  181. headers['User-Agent'] = useragent
  182. req = urllib2.Request(url, data, headers)
  183. opener = getOpener(proxies=proxies, authfunc=authfunc, enableGzip=enableGzip)
  184. if authfunc is not None:
  185. authfunc(opener)
  186. return opener.open(req)
  187. def authfunc(url, user, password):
  188. '''return a configured authfunc for a vls interface'''
  189. def _inner(opener):
  190. data = urllib.urlencode([('loginUser', user),
  191. ('loginPassword', password),
  192. ('login', ''),
  193. ('nohttps', ''),
  194. ])
  195. res = opener.open(url + '/auth/login', data=data).read()
  196. if res != 'successful':
  197. raise Exception('could not authentificate user "%s"\n----%s\n----' % (user, res[:100]))
  198. return _inner