# -*- coding: utf-8 -*-
"""
#########################################
walkscore.http_client
#########################################
Implements a standardized interface for requesting URLs from the internet and
returning the response.
"""
import sys
import warnings
import email
import time
import threading
import json
import io
# Optional HTTP back-ends are probed at import time. ``pycurl``, ``requests``
# and ``urlfetch`` are each bound to ``None`` when their library cannot be
# imported, so later code can simply test their truthiness.
#
# - Requests is the preferred HTTP library
# - Google App Engine has urlfetch
# - Use Pycurl if it's there (at least it verifies SSL certs)
# - Fall back to urllib2 with a warning if needed
try:
    import urllib2 as urllib
except ImportError:
    # Try to load in urllib2, but don't sweat it if it's not available.
    pass

try:
    import pycurl
except ImportError:
    pycurl = None

try:
    import requests
except ImportError:
    requests = None
else:
    try:
        # Require version 0.8.8, but don't want to depend on distutils
        version = requests.__version__
        major, minor, patch = [int(i) for i in version.split(".")]
    except Exception:  # pylint: disable=W0703
        # Probably some new-fangled version, so it should support verify
        pass
    else:
        if (major, minor, patch) < (0, 8, 8):
            # The message ends with a newline so that it does not run into
            # whatever is written to stderr next.
            sys.stderr.write(
                "Warning: the WalkScore library requires that your Python "
                '"requests" library be version 0.8.8 or newer, but your '
                '"requests" library is version %s. WalkScore will fall back to '
                "an alternate HTTP library so everything should work. We "
                'recommend upgrading your "requests" library. If you have any '
                "questions, please contact software@insightindustry.com. (HINT: running "
                '"pip install -U requests" should upgrade your requests '
                "library to the latest version.)\n" % (version,)
            )
            # Treat an outdated requests as if it were not installed at all.
            requests = None

try:
    from google.appengine.api import urlfetch
except ImportError:
    urlfetch = None
# proxy support for the pycurl client
import six
from six.moves.urllib.parse import urlparse
from backoff_utils import backoff
from backoff_utils import strategies as backoff_strategies
from validator_collection import validators
from walkscore.utilities import CA_BUNDLE_PATH, to_utf8
from walkscore.errors import check_for_errors, HTTPTimeoutError, SSLError, \
WalkScoreError, BindingError, HTTPConnectionError
# Upper-case HTTP verbs that ``HTTPClient.request`` accepts; any other method
# name is rejected with a ValueError there.
HTTP_METHODS = ['GET', 'HEAD', 'OPTIONS', 'POST', 'PUT', 'PATCH', 'DELETE']
def _now_ms():
"""Returns the current time expressed in milliseconds.
:rtype: :class:`int <python:int>`
"""
return int(round(time.time() * 1000))
def default_http_client(*args, **kwargs):
    """Build an HTTP client using the best back-end that was importable.

    Back-ends are tried in this order: ``urlfetch``, ``requests``, ``pycurl``
    and finally the urllib-based client, which is only used (with a warning)
    when none of the others is available. All positional and keyword arguments
    are forwarded unchanged to the selected client class.

    :rtype: :class:`HTTPClient`
    """
    if urlfetch:
        return UrlFetchClient(*args, **kwargs)
    if requests:
        return RequestsClient(*args, **kwargs)
    if pycurl:
        return PycurlClient(*args, **kwargs)

    # Last resort: warn first, then build the urllib-based client.
    warnings.warn(
        "Warning: the WalkScore library is falling back to urllib2/urllib "
        "because neither requests nor pycurl are installed. "
        "urllib2's SSL implementation doesn't verify server "
        "certificates. For improved security, we suggest installing "
        "requests."
    )
    return Urllib2Client(*args, **kwargs)
class HTTPClient(object):  # pylint: disable=R0205
    """Base class that provides HTTP connectivity.

    Sub-classes implement :meth:`_request` and :meth:`close` for a specific
    HTTP library; this class contributes argument validation
    (:meth:`request`) and retry handling (:meth:`request_with_retries`).
    """

    MAX_DELAY = 2
    INITIAL_DELAY = 0.5

    def __init__(self,
                 verify_ssl_certs = True,
                 proxy = None):
        """Store the SSL-verification flag and normalize the proxy setting.

        :param verify_ssl_certs: If ``True``, SSL certificates should be
          verified by the concrete client. Defaults to ``True``.
        :type verify_ssl_certs: :class:`bool <python:bool>`

        :param proxy: Either a single proxy URL (applied to both ``http`` and
          ``https``) or a :class:`dict <python:dict>` with proxy URLs under
          the ``http`` and/or ``https`` keys. Defaults to
          :obj:`None <python:None>`.
        :type proxy: :class:`str <python:str>` / :class:`dict <python:dict>` /
          :obj:`None <python:None>`

        :raises ValueError: if ``proxy`` is not empty and is neither a string
          nor a :class:`dict <python:dict>`
        """
        self._verify_ssl_certs = verify_ssl_certs

        if proxy:
            if isinstance(proxy, str):
                # A single URL is used for both schemes.
                proxy = {
                    "http": proxy,
                    "https": proxy
                }
            if not isinstance(proxy, dict):
                raise ValueError(
                    "Proxy(ies) must be specified as either a string "
                    "URL or a dict() with string URL under the "
                    "https and/or http keys."
                )

        # Keep a private copy so later changes to the caller's dict do not
        # affect this client.
        self._proxy = proxy.copy() if proxy else None

        self._thread_local = threading.local()

    def request_with_retries(self,
                             method,
                             url,
                             parameters = None,
                             headers = None,
                             request_body = None):
        """Execute a standard HTTP request with automatic retries on failure.

        :param method: The HTTP method to use for the request. Accepts `GET`, `HEAD`,
          `OPTIONS`, `POST`, `PATCH`, `PUT`, or `DELETE`.
        :type method: :class:`str <python:str>`

        :param url: The URL to execute the request against.
        :type url: :class:`str <python:str>`

        :param parameters: URL parameters to submit with the request. Defaults to
          :obj:`None <python:None>`.
        :type parameters: :class:`dict <python:dict>` / :obj:`None <python:None>`

        :param headers: HTTP headers to submit with the request. Defaults to
          :obj:`None <python:None>`.
        :type headers: :class:`dict <python:dict>` / :obj:`None <python:None>`

        :param request_body: The data to supply in the body of the request. Defaults to
          :obj:`None <python:None>`.
        :type request_body: :obj:`None <python:None>` / :class:`dict <python:dict>` /
          :class:`str <python:str>` / :class:`bytes <python:bytes>`

        .. note::

          This method will apply an
          `exponential backoff strategy <https://en.wikipedia.org/wiki/Exponential_backoff>`_
          to retry the API request if it times out. By default:

          * requests that can be retried will be retried up to ``3`` times, but this can
            be overridden by setting a ``BACKOFF_DEFAULT_TRIES`` environment variable with
            the maximum number of attempts to make
          * there is no maximum delay to wait before final failure, but this can be
            overridden by setting a ``BACKOFF_DEFAULT_DELAY`` environment variable with
            the maximum number of seconds to wait (across all attempts) before failing.

        :returns: Whatever :meth:`request` returns for the successful attempt.

        :raises ValueError: if ``method`` is not a recognized HTTP method
        :raises ValueError: if ``url`` is not a valid URL
        :raises HTTPTimeoutError: if the request times out after repeated attempts
        :raises SSLError: if the request fails SSL certificate verification
        :raises WalkScoreError: *or sub-classes* for other errors returned by the API
        """
        # The exception *class* itself must be listed here:
        # ``type(HTTPTimeoutError)`` would evaluate to ``type`` (the
        # metaclass), which never matches a raised timeout.
        response = backoff(self.request,
                           args = [method, url, parameters, headers, request_body],
                           catch_exceptions = [HTTPTimeoutError],
                           strategy = backoff_strategies.Exponential)

        return response

    def _request(self,
                 method,
                 url,
                 parameters = None,
                 headers = None,
                 request_body = None):
        """Execute a standard HTTP request. Must be overridden by sub-classes.

        :param method: The HTTP method to use for the request. Accepts `GET`, `HEAD`,
          `OPTIONS`, `POST`, `PATCH`, `PUT`, or `DELETE`.
        :type method: :class:`str <python:str>`

        :param url: The URL to execute the request against.
        :type url: :class:`str <python:str>`

        :param parameters: URL parameters to submit with the request. Defaults to
          :obj:`None <python:None>`.
        :type parameters: :class:`dict <python:dict>` / :obj:`None <python:None>`

        :param headers: HTTP headers to submit with the request. Defaults to
          :obj:`None <python:None>`.
        :type headers: :class:`dict <python:dict>` / :obj:`None <python:None>`

        :param request_body: The data to supply in the body of the request. Defaults to
          :obj:`None <python:None>`.
        :type request_body: :obj:`None <python:None>` / :class:`dict <python:dict>` /
          :class:`str <python:str>` / :class:`bytes <python:bytes>`

        :returns: The content of the HTTP response, the status code of the HTTP response,
          and the headers of the HTTP response.
        :rtype: :class:`tuple <python:tuple>` of :class:`bytes <python:bytes>`,
          :class:`int <python:int>`, and :class:`dict <python:dict>`

        :raises NotImplementedError: always, in this base class
        :raises HTTPTimeoutError: (in sub-classes) if the request times out
        :raises SSLError: (in sub-classes) if the request fails SSL certificate
          verification
        :raises WalkScoreError: (in sub-classes) *or sub-classes* for other errors
        """
        raise NotImplementedError(
            "HTTPClient subclasses must implement `_request`"
        )

    def request(self,
                method,
                url,
                parameters = None,
                headers = None,
                request_body = None):
        """Validate the arguments, execute the HTTP request and check the response.

        :param method: The HTTP method to use for the request. Accepts `GET`, `HEAD`,
          `OPTIONS`, `POST`, `PATCH`, `PUT`, or `DELETE` (case-insensitive).
        :type method: :class:`str <python:str>`

        :param url: The URL to execute the request against.
        :type url: :class:`str <python:str>`

        :param parameters: URL parameters to submit with the request. Defaults to
          :obj:`None <python:None>`.
        :type parameters: :class:`dict <python:dict>` / :obj:`None <python:None>`

        :param headers: HTTP headers to submit with the request. Defaults to
          :obj:`None <python:None>`.
        :type headers: :class:`dict <python:dict>` / :obj:`None <python:None>`

        :param request_body: The data to supply in the body of the request. Defaults to
          :obj:`None <python:None>`.
        :type request_body: :obj:`None <python:None>` / :class:`dict <python:dict>` /
          :class:`str <python:str>` / :class:`bytes <python:bytes>`

        :returns: The content of the HTTP response, the status code of the HTTP response,
          and the headers of the HTTP response.
        :rtype: :class:`tuple <python:tuple>` of :class:`bytes <python:bytes>`,
          :class:`int <python:int>`, and :class:`dict <python:dict>`

        :raises ValueError: if ``method`` is not a recognized HTTP method
        :raises ValueError: if ``url`` is not a valid URL
        :raises ValueError: if ``headers`` is not empty and is not a
          :class:`dict <python:dict>`
        :raises HTTPTimeoutError: if the request times out
        :raises SSLError: if the request fails SSL certificate verification
        :raises WalkScoreError: *or sub-classes* for other errors returned by the API
        """
        method = validators.string(method, allow_empty = False)
        method = method.upper()
        if method not in HTTP_METHODS:
            raise ValueError('method (%s) not a recognized HTTP method' % method)

        url = validators.url(url, allow_empty = False)
        parameters = validators.dict(parameters, allow_empty = True)
        headers = validators.dict(headers, allow_empty = True)

        # A separate name keeps the request headers distinct from the
        # headers that come back with the response.
        content, status_code, response_headers = self._request(method,
                                                               url,
                                                               parameters,
                                                               headers,
                                                               request_body)

        check_for_errors(status_code, content)

        return content, status_code, response_headers

    def close(self):
        """Closes an existing HTTP connection/session.

        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError(
            "HTTPClient subclasses must implement `close`"
        )
class RequestsClient(HTTPClient):
    """:class:`HTTPClient` for the :doc:`requests <requests:index>` library."""

    name = "requests"

    def __init__(self,
                 timeout = 80,
                 session = None,
                 **kwargs):
        """Create the client.

        :param timeout: Timeout value handed to ``requests`` for every
          request. Defaults to ``80``.

        :param session: An existing session object to use instead of creating
          a new ``requests.Session`` per thread. Defaults to
          :obj:`None <python:None>`.

        :param kwargs: Forwarded to :class:`HTTPClient` (``verify_ssl_certs``,
          ``proxy``).
        """
        super(RequestsClient, self).__init__(**kwargs)
        self._session = session
        self._timeout = timeout

    def _request(self,
                 method,
                 url,
                 parameters = None,
                 headers = None,
                 request_body = None):
        """Execute a standard HTTP request using ``requests``.

        :param method: The HTTP method to use for the request.
        :type method: :class:`str <python:str>`

        :param url: The URL to execute the request against.
        :type url: :class:`str <python:str>`

        :param parameters: URL parameters to submit with the request. Defaults to
          :obj:`None <python:None>`.
        :type parameters: :class:`dict <python:dict>` / :obj:`None <python:None>`

        :param headers: HTTP headers to submit with the request. Defaults to
          :obj:`None <python:None>`.
        :type headers: :class:`dict <python:dict>` / :obj:`None <python:None>`

        :param request_body: The data to supply in the body of the request. Defaults to
          :obj:`None <python:None>`.
        :type request_body: :obj:`None <python:None>` / :class:`dict <python:dict>` /
          :class:`str <python:str>` / :class:`bytes <python:bytes>`

        :returns: The content of the HTTP response, the status code of the HTTP response,
          and the headers of the HTTP response.
        :rtype: :class:`tuple <python:tuple>`

        :raises WalkScoreError: *or sub-classes*, or the underlying exception,
          if the request could not be completed
        """
        kwargs = {
            "verify": CA_BUNDLE_PATH if self._verify_ssl_certs else False
        }
        if self._proxy:
            kwargs["proxies"] = self._proxy

        # One session per thread; a caller-supplied session takes precedence
        # over a freshly created one.
        if getattr(self._thread_local, "session", None) is None:
            self._thread_local.session = self._session or requests.Session()

        try:
            try:
                result = self._thread_local.session.request(method,
                                                            url,
                                                            params = parameters,
                                                            headers = headers,
                                                            data = request_body,
                                                            timeout = self._timeout,
                                                            **kwargs)
            except TypeError as error:
                raise TypeError(
                    "Warning: It looks like your installed version of the "
                    '"requests" library is not compatible with WalkScore\'s '
                    "usage thereof. (HINT: The most likely cause is that "
                    'your "requests" library is out of date. You can fix '
                    'that by running "pip install -U requests".) The '
                    "underlying error was: %s" % (error,)
                )

            # This causes the content to actually be read, which could cause
            # e.g. a socket timeout. TODO: The other fetch methods probably
            # are susceptible to the same and should be updated.
            content = result.content
            status_code = result.status_code
            response_headers = result.headers
        except Exception as error:  # pylint: disable=W0703
            # Would catch just requests.exceptions.RequestException, but can
            # also raise ValueError, RuntimeError, etc.
            #
            # NOTE(review): ``from_exception`` is defined outside this module;
            # whether it raises or returns the translated error is not visible
            # here. Both cases are handled so that execution can never fall
            # through to the ``return`` below with unbound locals.
            translated = WalkScoreError.from_exception(error)
            if isinstance(translated, BaseException):
                raise translated
            raise

        return content, status_code, response_headers

    def close(self):
        """Closes an existing HTTP connection/session."""
        if getattr(self._thread_local, "session", None) is not None:
            self._thread_local.session.close()
class UrlFetchClient(HTTPClient):
    """:class:`HTTPClient` for Google App Engine's ``urlfetch`` service."""

    name = "urlfetch"

    def __init__(self,
                 verify_ssl_certs = True,
                 proxy = None,
                 deadline = 55):
        """Create the client.

        :param verify_ssl_certs: If ``True``, ask urlfetch to validate the
          server certificate. Defaults to ``True``.
        :type verify_ssl_certs: :class:`bool <python:bool>`

        :param proxy: Must be empty; urlfetch has no proxy support.

        :param deadline: Deadline passed to urlfetch for each request.
          Defaults to ``55``.

        :raises ValueError: if ``proxy`` is supplied
        """
        super(UrlFetchClient, self).__init__(verify_ssl_certs = verify_ssl_certs,
                                             proxy = proxy)

        # no proxy support in urlfetch. for a patch, see:
        # https://code.google.com/p/googleappengine/issues/detail?id=544
        if proxy:
            raise ValueError(
                "No proxy support in urlfetch library. "
                "Set walkscore.default_http_client to either RequestsClient, "
                "PycurlClient, or Urllib2Client instance to use a proxy."
            )

        # GAE requests time out after 60 seconds, so make sure to default
        # to 55 seconds to allow for a slow WalkScore API
        self._deadline = deadline

    def _request(self,
                 method,
                 url,
                 parameters = None,
                 headers = None,
                 request_body = None):
        """Execute a standard HTTP request using App Engine's ``urlfetch.fetch``.

        :param method: The HTTP method to use for the request.
        :type method: :class:`str <python:str>`

        :param url: The URL to execute the request against.
        :type url: :class:`str <python:str>`

        :param parameters: URL parameters, URL-encoded and appended to ``url``.
          Defaults to :obj:`None <python:None>`.
        :type parameters: :class:`dict <python:dict>` / :obj:`None <python:None>`

        :param headers: HTTP headers to submit with the request. Defaults to
          :obj:`None <python:None>`.
        :type headers: :class:`dict <python:dict>` / :obj:`None <python:None>`

        :param request_body: The request payload; a :class:`dict <python:dict>`
          is serialized to JSON first. Defaults to :obj:`None <python:None>`.

        :returns: The content, status code and headers of the HTTP response.
        :rtype: :class:`tuple <python:tuple>`

        :raises BindingError: if urlfetch reports the URL as invalid
        :raises HTTPConnectionError: for any other urlfetch error
        """
        if isinstance(request_body, dict):
            request_body = json.dumps(request_body)

        # ``urlfetch.fetch`` has no separate query-parameter argument, so the
        # parameters are encoded into the URL.
        if parameters:
            separator = '&' if '?' in url else '?'
            url += separator + urlencode(parameters)

        try:
            result = urlfetch.fetch(
                url = url,
                payload = request_body,
                method = method,
                headers = headers or {},
                # Google App Engine doesn't let us specify our own cert bundle,
                # so verification relies on the CA bundle App Engine provides.
                validate_certificate = self._verify_ssl_certs,
                deadline = self._deadline,
            )
        except urlfetch.Error as error:
            if isinstance(error, urlfetch.InvalidURLError):
                raise BindingError(
                    "The WalkScore library attempted to fetch an "
                    "invalid URL (%r). This is likely due to a bug "
                    "in the WalkScore Python bindings. Please let us know "
                    "at software@insightindustry.com." % (url,)
                )
            elif isinstance(error, urlfetch.DownloadError):
                message = "There was a problem retrieving data from WalkScore."
            elif isinstance(error, urlfetch.ResponseTooLargeError):
                message = (
                    "There was a problem receiving all of your data from "
                    "WalkScore. This is likely due to a bug in WalkScore. "
                )
            else:
                message = (
                    "Unexpected error communicating with WalkScore. If this "
                    "problem persists, let us know at software@insightindustry.com."
                )

            # ``HTTPConnectionError`` is what the pycurl client raises for the
            # same class of failure, and it is imported by this module.
            raise HTTPConnectionError(message)

        return result.content, result.status_code, result.headers

    def close(self):
        """Nothing to release: this client keeps no connection or session."""
        pass
class PycurlClient(HTTPClient):
    """:class:`HTTPClient` for the `pycurl <http://pycurl.io/docs/latest/index.html>`_
    library.
    """

    name = "pycurl"

    def __init__(self,
                 verify_ssl_certs = True,
                 proxy = None):
        """Create the client and its reusable ``pycurl.Curl`` handle.

        :param verify_ssl_certs: If ``True``, verify certificates against the
          bundled CA file. Defaults to ``True``.
        :type verify_ssl_certs: :class:`bool <python:bool>`

        :param proxy: Proxy URL or ``dict`` of proxy URLs, see
          :class:`HTTPClient`. Defaults to :obj:`None <python:None>`.
        """
        super(PycurlClient, self).__init__(verify_ssl_certs = verify_ssl_certs,
                                           proxy = proxy)

        # Initialize this within the object so that we can reuse connections.
        self._curl = pycurl.Curl()

        # need to urlparse the proxy, since PyCurl
        # consumes the proxy url in small pieces
        if self._proxy:
            # now that we have the parser, get the proxy url pieces
            for scheme in self._proxy:
                self._proxy[scheme] = urlparse(self._proxy[scheme])

    def parse_headers(self, data):  # pylint: disable=R0201
        """Parse headers into a :class:`dict <python:dict>`

        :param data: A string-like object with header data.
        :type data: :class:`bytes <python:bytes>` / :class:`str <python:str>`

        :returns: Dictionary of HTTP headers, keyed by lower-cased header
          name. Empty if ``data`` contains no ``CRLF``.
        :rtype: :class:`dict <python:dict>`
        """
        if "\r\n" not in data:
            return {}

        # Everything up to the first CRLF (the status line) is discarded.
        raw_headers = data.split("\r\n", 1)[1]
        headers = email.message_from_string(raw_headers)

        return dict((k.lower(), v) for k, v in six.iteritems(dict(headers)))

    def _request(self,
                 method,
                 url,
                 parameters = None,
                 headers = None,
                 request_body = None):
        """Execute a standard HTTP request using pycurl.

        :param method: The HTTP method to use for the request.
        :type method: :class:`str <python:str>`

        :param url: The URL to execute the request against.
        :type url: :class:`str <python:str>`

        :param parameters: URL parameters, URL-encoded and appended to ``url``.
          Defaults to :obj:`None <python:None>`.
        :type parameters: :class:`dict <python:dict>` / :obj:`None <python:None>`

        :param headers: HTTP headers to submit with the request. Defaults to
          :obj:`None <python:None>`.
        :type headers: :class:`dict <python:dict>` / :obj:`None <python:None>`

        :param request_body: The request body; a :class:`dict <python:dict>` is
          serialized to JSON first. Only sent for ``POST``, ``PUT`` and
          ``PATCH``. Defaults to :obj:`None <python:None>`.

        :returns: The UTF-8 decoded response body, the status code, and the
          response headers keyed by lower-cased name.
        :rtype: :class:`tuple <python:tuple>` of :class:`str <python:str>`,
          :class:`int <python:int>`, and :class:`dict <python:dict>`

        :raises HTTPTimeoutError: if the transfer times out
        :raises SSLError: if certificate verification fails
        :raises HTTPConnectionError: for any other pycurl error
        """
        if isinstance(request_body, dict):
            request_body = json.dumps(request_body)
        # POSTFIELDS cannot be set to ``None``; send an empty body instead.
        payload = "" if request_body is None else request_body

        body_buffer = io.BytesIO()
        header_buffer = io.BytesIO()

        # Pycurl's design is a little weird: although we set per-request
        # options on this object, it's also capable of maintaining established
        # connections. Here we call reset() between uses to make sure it's in a
        # pristine state, but notably reset() doesn't reset connections, so we
        # still get to take advantage of those by virtue of re-using the same
        # object.
        self._curl.reset()

        proxy = self._get_proxy(url)
        if proxy:
            if proxy.hostname:
                self._curl.setopt(pycurl.PROXY, proxy.hostname)
            if proxy.port:
                self._curl.setopt(pycurl.PROXYPORT, proxy.port)
            if proxy.username or proxy.password:
                self._curl.setopt(
                    pycurl.PROXYUSERPWD,
                    "%s:%s" % (proxy.username, proxy.password),
                )

        if method == "GET":
            self._curl.setopt(pycurl.HTTPGET, 1)
        elif method == 'HEAD':
            self._curl.setopt(pycurl.NOBODY, 1)
        elif method == "POST":
            self._curl.setopt(pycurl.POST, 1)
            self._curl.setopt(pycurl.POSTFIELDS, payload)
        elif method in ('PUT', 'PATCH'):
            self._curl.setopt(pycurl.CUSTOMREQUEST, method)
            self._curl.setopt(pycurl.POSTFIELDS, payload)
        else:
            self._curl.setopt(pycurl.CUSTOMREQUEST, method.upper())

        if parameters:
            separator = '&' if '?' in url else '?'
            url += separator + urlencode(parameters)

        # pycurl doesn't like unicode URLs
        url = to_utf8(url)

        self._curl.setopt(pycurl.URL, url)
        self._curl.setopt(pycurl.WRITEFUNCTION, body_buffer.write)
        self._curl.setopt(pycurl.HEADERFUNCTION, header_buffer.write)
        self._curl.setopt(pycurl.NOSIGNAL, 1)
        self._curl.setopt(pycurl.CONNECTTIMEOUT, 30)
        self._curl.setopt(pycurl.TIMEOUT, 80)

        if headers:
            self._curl.setopt(
                pycurl.HTTPHEADER,
                ["%s: %s" % (k, v) for k, v in six.iteritems(dict(headers))],
            )

        if self._verify_ssl_certs:
            self._curl.setopt(pycurl.CAINFO, CA_BUNDLE_PATH)
        else:
            # NOTE(review): only host-name verification is switched off here;
            # SSL_VERIFYPEER is left untouched - confirm that is intended.
            self._curl.setopt(pycurl.SSL_VERIFYHOST, False)

        try:
            self._curl.perform()
        except pycurl.error as error:
            self._handle_request_error(error)

        rbody = body_buffer.getvalue().decode("utf-8")
        rcode = self._curl.getinfo(pycurl.RESPONSE_CODE)
        response_headers = self.parse_headers(header_buffer.getvalue().decode("utf-8"))

        return rbody, rcode, response_headers

    @classmethod
    def _handle_request_error(cls, error):
        """Translate a ``pycurl.error`` into one of the library's exceptions.

        :param error: The error raised by pycurl; ``error.args[0]`` is compared
          against pycurl's ``E_*`` error codes.

        :raises HTTPTimeoutError: if the operation timed out
        :raises HTTPConnectionError: if the host could not be resolved or
          connected to, or for any unrecognized error code
        :raises SSLError: if the server certificate could not be verified
        """
        error_code = error.args[0]

        if error_code == pycurl.E_OPERATION_TIMEOUTED:
            raise HTTPTimeoutError('Could not connect to the WalkScore API. '
                                   'Please check your internet connection and try again. ')
        # Membership test: comparing the code to the list with ``==`` can
        # never be true.
        elif error_code in [pycurl.E_COULDNT_CONNECT,
                            pycurl.E_COULDNT_RESOLVE_HOST]:
            raise HTTPConnectionError("Could not connect to WalkScore. Please check "
                                      "your internet connection and try again.")
        elif error_code in [pycurl.E_SSL_CACERT,
                            pycurl.E_SSL_PEER_CERTIFICATE]:
            raise SSLError("Could not verify WalkScore's SSL certificate. Please make "
                           "sure that your network is not intercepting certificates.")
        else:
            raise HTTPConnectionError(
                "Unexpected error communicating with the WalkScore API. If this "
                "problem persists, let us know at software@insightindustry.com."
            )

    def _get_proxy(self, url):
        """Return the parsed proxy URL that applies to ``url``, if any.

        The lookup first uses the URL's scheme as-is and then the scheme with
        its last character removed (so ``https`` falls back to ``http``).

        :param url: The URL about to be requested.
        :type url: :class:`str <python:str>`

        :returns: The parsed proxy URL stored for the scheme, or
          :obj:`None <python:None>` if no proxy applies.
        """
        if not self._proxy or not url:
            return None

        scheme = url.split(":")[0]
        if not scheme:
            return None

        if scheme in self._proxy:
            return self._proxy[scheme]

        fallback_scheme = scheme[0:-1]
        if fallback_scheme in self._proxy:
            return self._proxy[fallback_scheme]

        return None

    def close(self):
        """No-op: the curl handle is kept open for reuse."""
        pass
class Urllib2Client(HTTPClient):
    """:class:`HTTPClient` built on the standard library's urllib
    (``urllib2`` on Python 2, ``urllib.request`` on Python 3).
    """

    name = "urllib.request"

    def __init__(self,
                 verify_ssl_certs = True,
                 proxy = None):
        """Create the client and, if a proxy is configured, its opener.

        :param verify_ssl_certs: Stored by :class:`HTTPClient`. Defaults to
          ``True``.
        :type verify_ssl_certs: :class:`bool <python:bool>`

        :param proxy: Proxy URL or ``dict`` of proxy URLs, see
          :class:`HTTPClient`. Defaults to :obj:`None <python:None>`.
        """
        super(Urllib2Client, self).__init__(verify_ssl_certs = verify_ssl_certs,
                                            proxy = proxy)

        # prepare and cache proxy tied opener here
        self._opener = None
        if self._proxy:
            proxy_handler = urllib.request.ProxyHandler(self._proxy)
            self._opener = urllib.request.build_opener(proxy_handler)

    def _request(self,
                 method,
                 url,
                 parameters = None,
                 headers = None,
                 request_body = None):
        """Execute a standard HTTP request using urllib.

        :param method: The HTTP method to use for the request.
        :type method: :class:`str <python:str>`

        :param url: The URL to execute the request against.
        :type url: :class:`str <python:str>`

        :param parameters: URL parameters, URL-encoded and appended to ``url``.
          Defaults to :obj:`None <python:None>`.
        :type parameters: :class:`dict <python:dict>` / :obj:`None <python:None>`

        :param headers: HTTP headers to submit with the request. Defaults to
          :obj:`None <python:None>`.
        :type headers: :class:`dict <python:dict>` / :obj:`None <python:None>`

        :param request_body: The request body; a :class:`dict <python:dict>` is
          serialized to JSON and text is encoded as UTF-8. Defaults to
          :obj:`None <python:None>`.

        :returns: The response body, the status code, and the response headers
          keyed by lower-cased name. HTTP error responses are returned, not
          raised.
        :rtype: :class:`tuple <python:tuple>`

        :raises WalkScoreError: *or sub-classes*, or the underlying exception,
          if the request could not be completed
        """
        if isinstance(request_body, dict):
            request_body = json.dumps(request_body)
        request_body = to_utf8(request_body)
        # urllib on Python 3 only accepts a bytes payload.
        if isinstance(request_body, six.text_type):
            request_body = request_body.encode("utf-8")

        if parameters:
            separator = '&' if '?' in url else '?'
            url += separator + urlencode(parameters)

        # ``Request`` iterates over the headers, so ``None`` is not acceptable.
        request = urllib.request.Request(url, request_body, headers or {})

        # ``get_method`` is *called* by urllib, so it must be a callable
        # rather than the method name itself.
        request.get_method = lambda: method

        try:
            # use the custom proxy tied opener, if any.
            # otherwise, fall to the default urllib opener.
            if self._opener:
                response = self._opener.open(request)
            else:
                response = urllib.request.urlopen(request)

            rbody = response.read()
            rcode = response.code
            response_headers = dict(response.info())
            if hasattr(response, 'close'):
                response.close()
        except urllib.error.HTTPError as error:
            # An HTTP error status still carries a response; hand it back so
            # the caller can inspect the status code and body.
            rcode = error.code
            rbody = error.read()
            response_headers = dict(error.info())
            if hasattr(error, 'close'):
                error.close()
        except (urllib.error.URLError, ValueError) as error:
            # NOTE(review): ``from_exception`` is defined outside this module;
            # whether it raises or returns the translated error is not visible
            # here. Both cases are handled so that execution can never reach
            # the ``return`` below with unbound locals.
            translated = WalkScoreError.from_exception(error)
            if isinstance(translated, BaseException):
                raise translated
            raise

        response_headers = dict((k.lower(), v) for k, v
                                in six.iteritems(dict(response_headers)))

        return rbody, rcode, response_headers

    def close(self):
        """No-op: every response is closed as soon as it has been read."""
        pass