# Source code for walkscore.http_client

# -*- coding: utf-8 -*-

"""
#########################################
walkscore.http_client
#########################################

Implements a standardized interface for requesting URLs from the internet and
returning the response.

"""
import sys
import warnings
import email
import time
import threading
import json
import io

# - Requests is the preferred HTTP library
# - Google App Engine has urlfetch
# - Use Pycurl if it's there (at least it verifies SSL certs)
# - Fall back to urllib2 with a warning if needed
try:
    import urllib2 as urllib
except ImportError:
    # Try to load in urllib2, but don't sweat it if it's not available.
    pass

try:
    import pycurl
except ImportError:
    pycurl = None

# Optional dependency: bind ``requests`` to the imported module when it is
# installed and recent enough, otherwise bind it to ``None`` so that
# ``default_http_client`` can fall through to another HTTP library.
try:
    import requests
except ImportError:
    requests = None
else:
    try:
        # Require version 0.8.8, but don't want to depend on distutils
        version = requests.__version__
        # Only plain three-part "X.Y.Z" versions parse here; anything else
        # raises and is handled by the ``except`` branch below.
        major, minor, patch = [int(i) for i in version.split(".")]
    except Exception:                                                                     # pylint: disable=W0703
        # Probably some new-fangled version, so it should support verify
        pass
    else:
        # Tuple comparison is element-wise, so this is a proper version compare.
        if (major, minor, patch) < (0, 8, 8):
            sys.stderr.write(
                "Warning: the WalkScore library requires that your Python "
                '"requests" library be newer than version 0.8.8, but your '
                '"requests" library is version %s. WalkScore will fall back to '
                "an alternate HTTP library so everything should work. We "
                'recommend upgrading your "requests" library. If you have any '
                "questions, please contact software@insightindustry.com. (HINT: running "
                '"pip install -U requests" should upgrade your requests '
                "library to the latest version.)" % (version,)
            )
            # Treat a too-old ``requests`` exactly as if it were not installed.
            requests = None

try:
    from google.appengine.api import urlfetch
except ImportError:
    urlfetch = None

# proxy support for the pycurl client
import six
from six.moves.urllib.parse import urlparse

from backoff_utils import backoff
from backoff_utils import strategies as backoff_strategies
from validator_collection import validators

from walkscore.utilities import CA_BUNDLE_PATH, to_utf8
from walkscore.errors import check_for_errors, HTTPTimeoutError, SSLError, \
    WalkScoreError, BindingError, HTTPConnectionError


# HTTP verbs accepted by ``HTTPClient.request`` (compared after upper-casing).
HTTP_METHODS = 'GET HEAD OPTIONS POST PUT PATCH DELETE'.split()


def _now_ms():
    """Return the current wall-clock time as a whole number of milliseconds.

    The value is :func:`time.time() <python:time.time>` scaled to milliseconds
    and rounded to the nearest integer.

    :rtype: :class:`int <python:int>`
    """
    seconds_since_epoch = time.time()
    milliseconds = round(seconds_since_epoch * 1000)

    return int(milliseconds)


def default_http_client(*args, **kwargs):
    """Build an HTTP client using the best HTTP library that is available.

    The libraries are tried in this order: ``urlfetch`` (Google App Engine),
    ``requests``, ``pycurl`` and finally ``urllib2``/``urllib``. Choosing the
    last option emits a warning. All positional and keyword arguments are
    forwarded unchanged to the selected client's constructor.

    :rtype: :class:`HTTPClient`
    """
    if urlfetch:
        return UrlFetchClient(*args, **kwargs)

    if requests:
        return RequestsClient(*args, **kwargs)

    if pycurl:
        return PycurlClient(*args, **kwargs)

    # Nothing better is installed: warn before handing back the fallback client.
    warnings.warn(
        "Warning: the WalkScore library is falling back to urllib2/urllib "
        "because neither requests nor pycurl are installed. "
        "urllib2's SSL implementation doesn't verify server "
        "certificates. For improved security, we suggest installing "
        "requests."
    )

    return Urllib2Client(*args, **kwargs)


class HTTPClient(object):                                                                 # pylint: disable=R0205
    """Base class that provides HTTP connectivity.

    The base class validates arguments (:meth:`request`) and adds retry
    behaviour (:meth:`request_with_retries`). Subclasses provide the actual
    transport by implementing :meth:`_request` and :meth:`close`.
    """

    # NOTE(review): these two constants are not referenced anywhere in this
    # class; presumably retry-delay bounds in seconds - confirm before relying
    # on them.
    MAX_DELAY = 2
    INITIAL_DELAY = 0.5

    def __init__(self,
                 verify_ssl_certs = True,
                 proxy = None):
        """Store the SSL and proxy configuration shared by all clients.

        :param verify_ssl_certs: If ``True``, subclasses should verify the
          server's SSL certificate. Defaults to ``True``.
        :type verify_ssl_certs: :class:`bool <python:bool>`

        :param proxy: Either a single proxy URL (used for both ``http`` and
          ``https``) or a :class:`dict <python:dict>` with proxy URLs under
          the ``http`` and/or ``https`` keys. Defaults to
          :obj:`None <python:None>`.
        :type proxy: :class:`str <python:str>` / :class:`dict <python:dict>` /
          :obj:`None <python:None>`

        :raises ValueError: if ``proxy`` is truthy but is neither a string nor
          a :class:`dict <python:dict>`
        """
        self._verify_ssl_certs = verify_ssl_certs

        if proxy:
            # A bare URL applies to both schemes.
            if isinstance(proxy, str):
                proxy = {
                    "http": proxy,
                    "https": proxy
                }

            if not isinstance(proxy, dict):
                raise ValueError(
                    "Proxy(ies) must be specified as either a string "
                    "URL or a dict() with string URL under the "
                    "https and/or http keys."
                )

        # Copy so that later changes to the caller's dict (or a subclass
        # rewriting the stored values) do not leak between the two.
        self._proxy = proxy.copy() if proxy else None

        self._thread_local = threading.local()

    def request_with_retries(self,
                             method,
                             url,
                             parameters = None,
                             headers = None,
                             request_body = None):
        """Execute a standard HTTP request with automatic retries on failure.

        :param method: The HTTP method to use for the request. Accepts `GET`, `HEAD`,
          `OPTIONS`, `POST`, `PATCH`, `PUT`, or `DELETE`.
        :type method: :class:`str <python:str>`

        :param url: The URL to execute the request against.
        :type url: :class:`str <python:str>`

        :param parameters: URL parameters to submit with the request. Defaults to
          :obj:`None <python:None>`.
        :type parameters: :class:`dict <python:dict>` / :obj:`None <python:None>`

        :param headers: HTTP headers to submit with the request. Defaults to
          :obj:`None <python:None>`.
        :type headers: :class:`dict <python:dict>` / :obj:`None <python:None>`

        :param request_body: The data to supply in the body of the request. Defaults to
          :obj:`None <python:None>`.
        :type request_body: :obj:`None <python:None>` / :class:`dict <python:dict>` /
          :class:`str <python:str>` / :class:`bytes <python:bytes>`

        .. note::

          This method will apply an
          `exponential backoff strategy <https://en.wikipedia.org/wiki/Exponential_backoff>`_
          to retry the API request if it times out. By default:

          * requests that can be retried will be retried up to ``3`` times, but this can
            be overridden by setting a ``BACKOFF_DEFAULT_TRIES`` environment variable with
            the maximum number of attempts to make
          * there is no maximum delay to wait before final failure, but this can be
            overridden by setting a ``BACKOFF_DEFAULT_DELAY`` environment variable with
            the maximum number of seconds to wait (across all attempts) before failing.

        :returns: Whatever :meth:`request` returns for the successful attempt.

        :raises ValueError: if ``method`` is not either ``GET``, ``HEAD``, ``OPTIONS``,
          ``POST``, ``PATCH``, ``PUT`` or ``DELETE``
        :raises ValueError: if ``url`` is not a valid URL
        :raises HTTPTimeoutError: if the request times out after repeated attempts
        :raises SSLError: if the request fails SSL certificate verification
        :raises WalkScoreError: *or sub-classes* for other errors returned by the API

        """
        # Pass the exception *class* itself. ``type(HTTPTimeoutError)`` would be
        # the metaclass ``type``, which never matches a raised timeout, so no
        # retry would ever happen.
        response = backoff(self.request,
                           args = [method, url, parameters, headers, request_body],
                           catch_exceptions = [HTTPTimeoutError],
                           strategy = backoff_strategies.Exponential)

        return response

    def _request(self,
                 method,
                 url,
                 parameters = None,
                 headers = None,
                 request_body = None):
        """Execute a standard HTTP request (transport hook for subclasses).

        The base implementation always raises
        :class:`NotImplementedError <python:NotImplementedError>`.

        :param method: The HTTP method to use for the request. Accepts `GET`, `HEAD`,
          `OPTIONS`, `POST`, `PATCH`, `PUT`, or `DELETE`.
        :type method: :class:`str <python:str>`

        :param url: The URL to execute the request against.
        :type url: :class:`str <python:str>`

        :param parameters: URL parameters to submit with the request. Defaults to
          :obj:`None <python:None>`.
        :type parameters: :class:`dict <python:dict>` / :obj:`None <python:None>`

        :param headers: HTTP headers to submit with the request. Defaults to
          :obj:`None <python:None>`.
        :type headers: :class:`dict <python:dict>` / :obj:`None <python:None>`

        :param request_body: The data to supply in the body of the request. Defaults to
          :obj:`None <python:None>`.
        :type request_body: :obj:`None <python:None>` / :class:`dict <python:dict>` /
          :class:`str <python:str>` / :class:`bytes <python:bytes>`

        :returns: The content of the HTTP response, the status code of the HTTP response,
          and the headers of the HTTP response.
        :rtype: :class:`tuple <python:tuple>` of :class:`bytes <python:bytes>`,
          :class:`int <python:int>`, and :class:`dict <python:dict>`

        :raises NotImplementedError: always, unless overridden by a subclass
        :raises HTTPTimeoutError: if the request times out
        :raises SSLError: if the request fails SSL certificate verification
        :raises WalkScoreError: *or sub-classes* for other errors returned by the API

        """
        raise NotImplementedError(
            "HTTPClient subclasses must implement `_request`"
        )

    def request(self,
                method,
                url,
                parameters = None,
                headers = None,
                request_body = None):
        """Validate the arguments, execute a single HTTP request and check the result.

        :param method: The HTTP method to use for the request. Accepts `GET`, `HEAD`,
          `OPTIONS`, `POST`, `PATCH`, `PUT`, or `DELETE` (case-insensitive).
        :type method: :class:`str <python:str>`

        :param url: The URL to execute the request against.
        :type url: :class:`str <python:str>`

        :param parameters: URL parameters to submit with the request. Defaults to
          :obj:`None <python:None>`.
        :type parameters: :class:`dict <python:dict>` / :obj:`None <python:None>`

        :param headers: HTTP headers to submit with the request. Defaults to
          :obj:`None <python:None>`.
        :type headers: :class:`dict <python:dict>` / :obj:`None <python:None>`

        :param request_body: The data to supply in the body of the request. Defaults to
          :obj:`None <python:None>`.
        :type request_body: :obj:`None <python:None>` / :class:`dict <python:dict>` /
          :class:`str <python:str>` / :class:`bytes <python:bytes>`

        :returns: The content of the HTTP response, the status code of the HTTP response,
          and the headers of the HTTP response.
        :rtype: :class:`tuple <python:tuple>` of :class:`bytes <python:bytes>`,
          :class:`int <python:int>`, and :class:`dict <python:dict>`

        :raises ValueError: if ``method`` is not either ``GET``, ``HEAD``, ``OPTIONS``,
          ``POST``, ``PATCH``, ``PUT`` or ``DELETE``
        :raises ValueError: if ``url`` is not a valid URL
        :raises ValueError: if ``headers`` is not empty and is not a
          :class:`dict <python:dict>`

        :raises HTTPTimeoutError: if the request times out
        :raises SSLError: if the request fails SSL certificate verification
        :raises WalkScoreError: *or sub-classes* for other errors returned by the API

        """
        method = validators.string(method, allow_empty = False)
        method = method.upper()
        if method not in HTTP_METHODS:
            raise ValueError('method (%s) not a recognized HTTP method' % method)

        url = validators.url(url, allow_empty = False)

        parameters = validators.dict(parameters, allow_empty = True)
        headers = validators.dict(headers, allow_empty = True)

        # From here on ``headers`` holds the *response* headers.
        content, status_code, headers = self._request(method,
                                                      url,
                                                      parameters,
                                                      headers,
                                                      request_body)

        # Presumably raises the matching WalkScoreError for an error response;
        # the helper is defined in ``walkscore.errors``.
        check_for_errors(status_code, content)

        return content, status_code, headers

    def close(self):
        """Closes an existing HTTP connection/session.

        :raises NotImplementedError: always, unless overridden by a subclass
        """

        raise NotImplementedError(
            "HTTPClient subclasses must implement `close`"
        )


class RequestsClient(HTTPClient):
    """:class:`HTTPClient` for the :doc:`requests <requests:index>` library."""

    name = "requests"

    def __init__(self,
                 timeout = 80,
                 session = None,
                 **kwargs):
        """Configure the client.

        :param timeout: Timeout passed to every ``requests`` call. Defaults to
          ``80``.
        :param session: An existing session object to use instead of creating
          a new ``requests.Session``. Defaults to :obj:`None <python:None>`.
        :param kwargs: Forwarded to :class:`HTTPClient` (``verify_ssl_certs``,
          ``proxy``).
        """
        super(RequestsClient, self).__init__(**kwargs)

        self._session = session
        self._timeout = timeout

    def _request(self,
                 method,
                 url,
                 parameters = None,
                 headers = None,
                 request_body = None):
        """Execute a standard HTTP request.

        :param method: The HTTP method to use for the request. Accepts `GET`, `HEAD`,
          `POST`, `PATCH`, `PUT`, or `DELETE`.
        :type method: :class:`str <python:str>`

        :param url: The URL to execute the request against.
        :type url: :class:`str <python:str>`

        :param parameters: URL parameters to submit with the request. Defaults to
          :obj:`None <python:None>`.
        :type parameters: :class:`dict <python:dict>` / :obj:`None <python:None>`

        :param headers: HTTP headers to submit with the request. Defaults to
          :obj:`None <python:None>`.
        :type headers: :class:`dict <python:dict>` / :obj:`None <python:None>`

        :param request_body: The data to supply in the body of the request. Defaults to
          :obj:`None <python:None>`.
        :type request_body: :obj:`None <python:None>` / :class:`dict <python:dict>` /
          :class:`str <python:str>` / :class:`bytes <python:bytes>`

        :returns: The content of the HTTP response, the status code of the HTTP response,
          and the headers of the HTTP response.
        :rtype: :class:`tuple <python:tuple>` of :class:`bytes <python:bytes>`,
          :class:`int <python:int>`, and the ``requests`` response headers mapping

        :raises HTTPTimeoutError: if the request times out
        :raises SSLError: if the request fails SSL certificate verification
        :raises WalkScoreError: *or sub-classes* for other errors returned by the API

        """
        kwargs = {
            "verify": CA_BUNDLE_PATH if self._verify_ssl_certs else False
        }

        if self._proxy:
            kwargs["proxies"] = self._proxy

        # One session per thread: the first request made on a thread creates
        # (or adopts the injected) session and later requests reuse it.
        if getattr(self._thread_local, "session", None) is None:
            self._thread_local.session = self._session or requests.Session()

        try:
            try:
                result = self._thread_local.session.request(method,
                                                            url,
                                                            params = parameters,
                                                            headers = headers,
                                                            data = request_body,
                                                            timeout = self._timeout,
                                                            **kwargs)
            except TypeError as error:
                raise TypeError(
                    "Warning: It looks like your installed version of the "
                    '"requests" library is not compatible with WalkScore\'s '
                    "usage thereof. (HINT: The most likely cause is that "
                    'your "requests" library is out of date. You can fix '
                    'that by running "pip install -U requests".) The '
                    "underlying error was: %s" % (error,)
                )

            # This causes the content to actually be read, which could cause
            # e.g. a socket timeout. TODO: The other fetch methods probably
            # are susceptible to the same and should be updated.
            content = result.content
            status_code = result.status_code
            response_headers = result.headers
        except Exception as error:                                                        # pylint: disable=W0703
            # Would catch just requests.exceptions.RequestException, but can
            # also raise ValueError, RuntimeError, etc.
            #
            # NOTE(review): ``from_exception`` lives in ``walkscore.errors`` and
            # is not visible here. If it raises, nothing below runs. If it
            # returns the translated exception, raise it. Either way we must
            # never fall through to the ``return`` with unbound locals.
            translated = WalkScoreError.from_exception(error)
            if isinstance(translated, BaseException):
                raise translated
            raise

        return content, status_code, response_headers

    def close(self):
        """Closes an existing HTTP connection/session.

        Only the session bound to the calling thread (if any) is closed.
        """

        if getattr(self._thread_local, "session", None) is not None:
            self._thread_local.session.close()


class UrlFetchClient(HTTPClient):
    """:class:`HTTPClient` for the :doc:`urlfetch <urlfetch:index>` library."""

    name = "urlfetch"

    def __init__(self,
                 verify_ssl_certs = True,
                 proxy = None,
                 deadline = 55):
        """Configure the client.

        :param verify_ssl_certs: If ``True``, ask urlfetch to validate the
          server certificate. Defaults to ``True``.
        :param proxy: Must be empty; urlfetch has no proxy support.
        :param deadline: Deadline passed to every fetch. Defaults to ``55``.

        :raises ValueError: if ``proxy`` is supplied
        """
        super(UrlFetchClient, self).__init__(verify_ssl_certs = verify_ssl_certs,
                                             proxy = proxy)

        # no proxy support in urlfetch. for a patch, see:
        # https://code.google.com/p/googleappengine/issues/detail?id=544
        if proxy:
            raise ValueError(
                "No proxy support in urlfetch library. "
                "Set walkscore.default_http_client to either RequestsClient, "
                "PycurlClient, or Urllib2Client instance to use a proxy."
            )

        # GAE requests time out after 60 seconds, so make sure to default
        # to 55 seconds to allow for a slow WalkScore API
        self._deadline = deadline

    def _request(self,
                 method,
                 url,
                 parameters = None,
                 headers = None,
                 request_body = None):
        """Execute a standard HTTP request through App Engine's urlfetch.

        :param method: The HTTP method to use for the request.
        :type method: :class:`str <python:str>`

        :param url: The URL to execute the request against.
        :type url: :class:`str <python:str>`

        :param parameters: URL parameters, appended to ``url`` as a query
          string. Defaults to :obj:`None <python:None>`.
        :type parameters: :class:`dict <python:dict>` / :obj:`None <python:None>`

        :param headers: HTTP headers to submit with the request. Defaults to
          :obj:`None <python:None>`.
        :type headers: :class:`dict <python:dict>` / :obj:`None <python:None>`

        :param request_body: The payload to send in the body of the request.
          Defaults to :obj:`None <python:None>`.

        :returns: The content, status code and headers of the HTTP response.
        :rtype: :class:`tuple <python:tuple>`

        :raises BindingError: if urlfetch rejects the URL as invalid
        :raises HTTPConnectionError: for any other urlfetch error
        """
        if parameters:
            # Imported locally because the module only imports ``urlparse``.
            from six.moves.urllib.parse import urlencode

            # urlfetch has no ``params`` argument: the query string has to be
            # part of the URL itself.
            separator = "&" if "?" in url else "?"
            url = url + separator + urlencode(parameters)

        try:
            result = urlfetch.fetch(
                url = url,
                payload = request_body,
                method = method,
                headers = headers or {},
                # Google App Engine doesn't let us specify our own cert bundle,
                # so verification relies on the CA bundle App Engine ships.
                validate_certificate = self._verify_ssl_certs,
                deadline = self._deadline,
            )
        except urlfetch.Error as error:
            if isinstance(error, urlfetch.InvalidURLError):
                raise BindingError(
                    "The WalkScore library attempted to fetch an "
                    "invalid URL (%r). This is likely due to a bug "
                    "in the WalkScore Python bindings. Please let us know "
                    "at software@insightindustry.com." % (url,)
                )

            if isinstance(error, urlfetch.DownloadError):
                message = "There was a problem retrieving data from WalkScore."
            elif isinstance(error, urlfetch.ResponseTooLargeError):
                message = (
                    "There was a problem receiving all of your data from "
                    "WalkScore.  This is likely due to a bug in WalkScore. "
                )
            else:
                message = (
                    "Unexpected error communicating with WalkScore. If this "
                    "problem persists, let us know at software@insightindustry.com."
                )

            # ``InternalAPIError`` is not imported by this module, so raising
            # it would surface as a NameError. ``HTTPConnectionError`` is what
            # PycurlClient raises for the same kind of failure.
            raise HTTPConnectionError(message)

        return result.content, result.status_code, result.headers

    def close(self):
        """Do nothing: this client holds no connection or session to close."""
        pass


class PycurlClient(HTTPClient):
    """:class:`HTTPClient` for the `pycurl <http://pycurl.io/docs/latest/index.html>`_
    library.
    """

    name = "pycurl"

    def __init__(self,
                 verify_ssl_certs = True,
                 proxy = None):
        """Configure the client and create the reusable ``pycurl.Curl`` handle.

        :param verify_ssl_certs: If ``True``, verify the server certificate
          against ``CA_BUNDLE_PATH``. Defaults to ``True``.
        :param proxy: Proxy URL or ``dict`` of proxy URLs, as accepted by
          :class:`HTTPClient`. Defaults to :obj:`None <python:None>`.
        """
        super(PycurlClient, self).__init__(verify_ssl_certs = verify_ssl_certs,
                                           proxy = proxy)

        # Initialize this within the object so that we can reuse connections.
        self._curl = pycurl.Curl()

        # PyCurl consumes the proxy URL in small pieces (host, port,
        # credentials), so parse every configured proxy URL once up front.
        if self._proxy:
            self._proxy = dict(
                (scheme, urlparse(proxy_url))
                for scheme, proxy_url in six.iteritems(self._proxy)
            )

    def parse_headers(self, data):                                                        # pylint: disable=R0201
        """Parse headers into a :class:`dict <python:dict>`

        :param data: A string-like object with header data.
        :type data: :class:`bytes <python:bytes>` / :class:`str <python:str>`

        :returns: Dictionary of HTTP headers, keyed by lower-cased header
          name. Empty if ``data`` contains no ``\\r\\n`` line break.
        :rtype: :class:`dict <python:dict>`
        """
        if "\r\n" not in data:
            return {}

        # Drop the first line (the status line); the rest parses as headers.
        raw_headers = data.split("\r\n", 1)[1]
        headers = email.message_from_string(raw_headers)

        return dict((k.lower(), v) for k, v in six.iteritems(dict(headers)))

    def _request(self,
                 method,
                 url,
                 parameters = None,
                 headers = None,
                 request_body = None):
        """Execute a standard HTTP request using the shared curl handle.

        :param method: The HTTP method to use for the request.
        :type method: :class:`str <python:str>`

        :param url: The URL to execute the request against.
        :type url: :class:`str <python:str>`

        :param parameters: URL parameters, appended to ``url`` as a query
          string. Defaults to :obj:`None <python:None>`.
        :type parameters: :class:`dict <python:dict>` / :obj:`None <python:None>`

        :param headers: HTTP headers to submit with the request. Defaults to
          :obj:`None <python:None>`.
        :type headers: :class:`dict <python:dict>` / :obj:`None <python:None>`

        :param request_body: The data to supply in the body of the request; a
          :class:`dict <python:dict>` is serialized to JSON. Defaults to
          :obj:`None <python:None>`.

        :returns: The UTF-8 decoded response body, the status code, and the
          response headers keyed by lower-cased name.
        :rtype: :class:`tuple <python:tuple>` of :class:`str <python:str>`,
          :class:`int <python:int>`, and :class:`dict <python:dict>`

        :raises HTTPTimeoutError: if curl reports a timeout
        :raises SSLError: if curl reports a certificate problem
        :raises HTTPConnectionError: for any other curl error
        """
        if isinstance(request_body, dict):
            request_body = json.dumps(request_body)

        body_buffer = io.BytesIO()
        header_buffer = io.BytesIO()

        # Pycurl's design is a little weird: although we set per-request
        # options on this object, it's also capable of maintaining established
        # connections. Here we call reset() between uses to make sure it's in a
        # pristine state, but notably reset() doesn't reset connections, so we
        # still get to take advantage of those by virtue of re-using the same
        # object.
        self._curl.reset()

        proxy = self._get_proxy(url)
        if proxy:
            if proxy.hostname:
                self._curl.setopt(pycurl.PROXY, proxy.hostname)
            if proxy.port:
                self._curl.setopt(pycurl.PROXYPORT, proxy.port)
            if proxy.username or proxy.password:
                self._curl.setopt(
                    pycurl.PROXYUSERPWD,
                    "%s:%s" % (proxy.username, proxy.password),
                )

        if method == "GET":
            self._curl.setopt(pycurl.HTTPGET, 1)
        elif method == 'HEAD':
            self._curl.setopt(pycurl.NOBODY, 1)
        elif method == "POST":
            self._curl.setopt(pycurl.POST, 1)
            self._curl.setopt(pycurl.POSTFIELDS, request_body)
        elif method in ('PUT', 'PATCH'):
            self._curl.setopt(pycurl.CUSTOMREQUEST, method)
            self._curl.setopt(pycurl.POSTFIELDS, request_body)
        else:
            self._curl.setopt(pycurl.CUSTOMREQUEST, method.upper())

        if parameters:
            # Imported locally because the module only imports ``urlparse``.
            from six.moves.urllib.parse import urlencode

            separator = "&" if "?" in url else "?"
            url = url + separator + urlencode(parameters)

        # pycurl doesn't like unicode URLs
        url = to_utf8(url)
        self._curl.setopt(pycurl.URL, url)

        self._curl.setopt(pycurl.WRITEFUNCTION, body_buffer.write)
        self._curl.setopt(pycurl.HEADERFUNCTION, header_buffer.write)
        self._curl.setopt(pycurl.NOSIGNAL, 1)
        self._curl.setopt(pycurl.CONNECTTIMEOUT, 30)
        self._curl.setopt(pycurl.TIMEOUT, 80)

        if headers:
            self._curl.setopt(
                pycurl.HTTPHEADER,
                ["%s: %s" % (k, v) for k, v in six.iteritems(dict(headers))],
            )

        if self._verify_ssl_certs:
            self._curl.setopt(pycurl.CAINFO, CA_BUNDLE_PATH)
        else:
            # Both checks have to be switched off: libcurl verifies the peer
            # certificate by default independently of the host-name check.
            self._curl.setopt(pycurl.SSL_VERIFYPEER, False)
            self._curl.setopt(pycurl.SSL_VERIFYHOST, False)

        try:
            self._curl.perform()
        except pycurl.error as error:
            # Always raises a WalkScore error type.
            self._handle_request_error(error)

        rbody = body_buffer.getvalue().decode("utf-8")
        rcode = self._curl.getinfo(pycurl.RESPONSE_CODE)
        response_headers = self.parse_headers(header_buffer.getvalue().decode("utf-8"))

        return rbody, rcode, response_headers

    @classmethod
    def _handle_request_error(cls, error):
        """Translate a ``pycurl.error`` into a WalkScore error and raise it.

        :param error: The error raised by pycurl; ``error.args[0]`` is the
          curl error code.

        :raises HTTPTimeoutError: if the operation timed out
        :raises HTTPConnectionError: if the host could not be resolved or
          connected to, or for any other unrecognized error code
        :raises SSLError: if the server certificate could not be verified
        """
        error_code = error.args[0]

        if error_code == pycurl.E_OPERATION_TIMEOUTED:
            raise HTTPTimeoutError('Could not connect to the WalkScore API. '
                                   'Please check your internet connection and try again. ')

        # Membership test: comparing the code to the list with ``==`` can
        # never be true.
        if error_code in (pycurl.E_COULDNT_CONNECT,
                          pycurl.E_COULDNT_RESOLVE_HOST):
            raise HTTPConnectionError("Could not connect to WalkScore.  Please check "
                                      "your internet connection and try again.")

        if error_code in (pycurl.E_SSL_CACERT,
                          pycurl.E_SSL_PEER_CERTIFICATE):
            raise SSLError("Could not verify WalkScore's SSL certificate.  Please make "
                           "sure that your network is not intercepting certificates.")

        raise HTTPConnectionError(
            "Unexpected error communicating with the WalkScore API. If this "
            "problem persists, let us know at software@insightindustry.com."
        )

    def _get_proxy(self, url):
        """Return the parsed proxy configured for the scheme of ``url``.

        :param url: The URL about to be requested.
        :type url: :class:`str <python:str>`

        :returns: The parsed proxy URL stored for the URL's scheme, or for the
          scheme with its last character removed (e.g. ``https`` falls back
          to ``http``), or :obj:`None <python:None>` if neither is configured.
        """
        if not self._proxy or not url:
            return None

        scheme = url.split(":")[0]
        if not scheme:
            return None

        if scheme in self._proxy:
            return self._proxy[scheme]

        # Fall back from e.g. "https" to "http".
        return self._proxy.get(scheme[0:-1])

    def close(self):
        """Do nothing: the curl handle is kept so connections can be reused."""
        pass


class Urllib2Client(HTTPClient):
    """:class:`HTTPClient` for the :doc:`urllib2` package.

    The ``urllib`` API is reached through ``six.moves.urllib`` so that the same
    ``request`` / ``error`` / ``parse`` attribute layout is available on both
    Python 2 and Python 3.
    """

    name = "urllib.request"

    def __init__(self,
                 verify_ssl_certs = True,
                 proxy = None):
        """Configure the client and, if a proxy is set, build a proxy opener.

        :param verify_ssl_certs: Stored by :class:`HTTPClient`; this client
          does not use it when making requests.
        :param proxy: Proxy URL or ``dict`` of proxy URLs, as accepted by
          :class:`HTTPClient`. Defaults to :obj:`None <python:None>`.
        """
        super(Urllib2Client, self).__init__(verify_ssl_certs = verify_ssl_certs,
                                            proxy = proxy)

        # prepare and cache proxy tied opener here
        self._opener = None
        if self._proxy:
            # The module-level ``urllib`` name is ``urllib2`` on Python 2 and
            # unbound on Python 3, so neither offers ``urllib.request``.
            from six.moves import urllib as six_urllib

            proxy_handler = six_urllib.request.ProxyHandler(self._proxy)
            self._opener = six_urllib.request.build_opener(proxy_handler)

    def _request(self,
                 method,
                 url,
                 parameters = None,
                 headers = None,
                 request_body = None):
        """Execute a standard HTTP request using ``urllib``.

        :param method: The HTTP method to use for the request.
        :type method: :class:`str <python:str>`

        :param url: The URL to execute the request against.
        :type url: :class:`str <python:str>`

        :param parameters: URL parameters, appended to ``url`` as a query
          string. Defaults to :obj:`None <python:None>`.
        :type parameters: :class:`dict <python:dict>` / :obj:`None <python:None>`

        :param headers: HTTP headers to submit with the request. Defaults to
          :obj:`None <python:None>`.
        :type headers: :class:`dict <python:dict>` / :obj:`None <python:None>`

        :param request_body: The data to supply in the body of the request.
          Defaults to :obj:`None <python:None>`.

        :returns: The response body, the status code, and the response headers
          keyed by lower-cased name. HTTP error responses are returned in the
          same shape rather than raised.
        :rtype: :class:`tuple <python:tuple>`

        :raises WalkScoreError: *or sub-classes* when the URL cannot be opened
        """
        from six.moves import urllib as six_urllib

        request_body = to_utf8(request_body)

        if parameters:
            separator = "&" if "?" in url else "?"
            url = url + separator + six_urllib.parse.urlencode(parameters)

        # ``Request`` iterates over the headers, so ``None`` must become ``{}``.
        request = six_urllib.request.Request(url, request_body, headers or {})

        # urllib derives the verb from whether a body is present; override it
        # so the requested method is always the one sent. ``get_method`` must
        # be a callable, not the method string itself.
        request.get_method = lambda: method

        try:
            # use the custom proxy tied opener, if any.
            # otherwise, fall to the default urllib opener.
            if self._opener:
                response = self._opener.open(request)
            else:
                response = six_urllib.request.urlopen(request)

            rbody = response.read()
            rcode = response.code
            response_headers = dict(response.info())
            if hasattr(response, 'close'):
                response.close()
        except six_urllib.error.HTTPError as error:
            # An HTTP error status still carries a usable response.
            rcode = error.code
            rbody = error.read()
            response_headers = dict(error.info())
            if hasattr(error, 'close'):
                error.close()
        except (six_urllib.error.URLError, ValueError) as error:
            # NOTE(review): ``from_exception`` lives in ``walkscore.errors`` and
            # is not visible here. If it raises, nothing below runs. If it
            # returns the translated exception, raise it. Either way we must
            # never fall through to the ``return`` with unbound locals.
            translated = WalkScoreError.from_exception(error)
            if isinstance(translated, BaseException):
                raise translated
            raise

        response_headers = dict((k.lower(), v) for k, v
                                in six.iteritems(dict(response_headers)))

        return rbody, rcode, response_headers

    def close(self):
        """Do nothing: every response is closed as soon as it has been read."""
        pass