Diffstat (limited to 'chromium/third_party/webpagereplay/httparchive.py')
-rwxr-xr-x  chromium/third_party/webpagereplay/httparchive.py | 177
1 file changed, 104 insertions(+), 73 deletions(-)
diff --git a/chromium/third_party/webpagereplay/httparchive.py b/chromium/third_party/webpagereplay/httparchive.py
index e0c468234d4..3e710634f1b 100755
--- a/chromium/third_party/webpagereplay/httparchive.py
+++ b/chromium/third_party/webpagereplay/httparchive.py
@@ -22,13 +22,13 @@ To view the content of all URLs from example.com:
$ ./httparchive.py cat --host example.com archive.wpr
To view the content of a particular URL:
- $ ./httparchive.py cat --host www.example.com --path /foo archive.wpr
+ $ ./httparchive.py cat --host www.example.com --full_path /foo archive.wpr
To view the content of all URLs:
$ ./httparchive.py cat archive.wpr
To edit a particular URL:
- $ ./httparchive.py edit --host www.example.com --path /foo archive.wpr
+ $ ./httparchive.py edit --host www.example.com --full_path /foo archive.wpr
To print statistics of an archive:
$ ./httparchive.py stats archive.wpr
@@ -40,6 +40,7 @@ To merge multiple archives
$ ./httparchive.py merge --merged_file new.wpr archive1.wpr archive2.wpr ...
"""
+import calendar
import difflib
import email.utils
import httplib
@@ -57,8 +58,6 @@ import time
import urlparse
from collections import defaultdict
-import platformsettings
-
def LogRunTime(fn):
"""Annotation which logs the run time of the function."""
@@ -86,7 +85,6 @@ class HttpArchive(dict, persistentmixin.PersistentMixin):
Persist(filename)
Attributes:
- server_rtt: dict of {hostname, server rtt in milliseconds}
responses_by_host: dict of {hostname, {request: response}}. This must remain
in sync with the underlying dict of self. It is used as an optimization
so that get_requests() doesn't have to linearly search all requests in
@@ -94,7 +92,6 @@ class HttpArchive(dict, persistentmixin.PersistentMixin):
"""
def __init__(self):
- self.server_rtt = {}
self.responses_by_host = defaultdict(dict)
def __setstate__(self, state):
@@ -127,21 +124,6 @@ class HttpArchive(dict, persistentmixin.PersistentMixin):
super(HttpArchive, self).__delitem__(key)
del self.responses_by_host[key.host][key]
- def get_server_rtt(self, server):
- """Retrieves the round trip time (rtt) to the server
-
- Args:
- server: the hostname of the server
-
- Returns:
- round trip time to the server in seconds, or 0 if unavailable
- """
- if server not in self.server_rtt:
- # TODO(tonyg): Pinging inline with the request causes timeouts. Need to
- # find a way to restore this functionality.
- self.server_rtt[server] = 0 # platform_settings.ping_rtt(server)
- return self.server_rtt[server]
-
def get(self, request, default=None):
"""Return the archived response for a given request.
@@ -185,8 +167,8 @@ class HttpArchive(dict, persistentmixin.PersistentMixin):
def get_conditional_status(self, request, response):
status = 200
last_modified = email.utils.parsedate(
- response.get_header_case_insensitive('last-modified'))
- response_etag = response.get_header_case_insensitive('etag')
+ response.update_date(response.get_header('last-modified')))
+ response_etag = response.get_header('etag')
is_get_or_head = request.command.upper() in ('GET', 'HEAD')
match_value = request.headers.get('if-match', None)
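The hunk above stops comparing the recorded 'last-modified' value directly: it is first rebased by update_date() (added near the end of this diff), so the validator keeps its recorded age relative to the response's 'date' header. A minimal sketch of that arithmetic, using made-up header values:

    import calendar
    import email.utils
    import time

    recorded_now = calendar.timegm(
        email.utils.parsedate('Thu, 01 Dec 1994 16:00:00 GMT'))  # 'date' header
    last_modified = calendar.timegm(
        email.utils.parsedate('Wed, 30 Nov 1994 16:00:00 GMT'))  # one day older
    rebased = last_modified + time.time() - recorded_now
    # Still one day in the past, but now relative to the real clock:
    print email.utils.formatdate(rebased, usegmt=True)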
@@ -233,25 +215,27 @@ class HttpArchive(dict, persistentmixin.PersistentMixin):
return True
return False
- def get_requests(self, command=None, host=None, path=None, is_ssl=None,
+ def get_requests(self, command=None, host=None, full_path=None, is_ssl=None,
use_query=True):
"""Return a list of requests that match the given args."""
if host:
return [r for r in self.responses_by_host[host]
- if r.matches(command, None, path, is_ssl, use_query=use_query)]
+ if r.matches(command, None, full_path, is_ssl,
+ use_query=use_query)]
else:
return [r for r in self
- if r.matches(command, host, path, is_ssl, use_query=use_query)]
+ if r.matches(command, host, full_path, is_ssl,
+ use_query=use_query)]
- def ls(self, command=None, host=None, path=None):
+ def ls(self, command=None, host=None, full_path=None):
"""List all URLs that match given params."""
return ''.join(sorted(
- '%s\n' % r for r in self.get_requests(command, host, path)))
+ '%s\n' % r for r in self.get_requests(command, host, full_path)))
- def cat(self, command=None, host=None, path=None):
+ def cat(self, command=None, host=None, full_path=None):
"""Print the contents of all URLs that match given params."""
out = StringIO.StringIO()
- for request in self.get_requests(command, host, path):
+ for request in self.get_requests(command, host, full_path):
print >>out, str(request)
print >>out, 'Untrimmed request headers:'
for k in request.headers:
@@ -281,9 +265,9 @@ class HttpArchive(dict, persistentmixin.PersistentMixin):
print >>out, '=' * 70
return out.getvalue()
- def stats(self, command=None, host=None, path=None):
+ def stats(self, command=None, host=None, full_path=None):
"""Print stats about the archive for all URLs that match given params."""
- matching_requests = self.get_requests(command, host, path)
+ matching_requests = self.get_requests(command, host, full_path)
if not matching_requests:
print 'Failed to find any requests matching given command, host, path.'
return
@@ -338,21 +322,22 @@ class HttpArchive(dict, persistentmixin.PersistentMixin):
self[r] = http_archive_other[r]
self.Persist('%s' % merged_archive)
- def edit(self, command=None, host=None, path=None):
+ def edit(self, command=None, host=None, full_path=None):
"""Edits the single request which matches given params."""
editor = os.getenv('EDITOR')
if not editor:
print 'You must set the EDITOR environmental variable.'
return
- matching_requests = self.get_requests(command, host, path)
+ matching_requests = self.get_requests(command, host, full_path)
if not matching_requests:
- print 'Failed to find any requests matching given command, host, path.'
+ print ('Failed to find any requests matching given command, host, '
+ 'full_path.')
return
if len(matching_requests) > 1:
print 'Found multiple matching requests. Please refine.'
- print self.ls(command, host, path)
+ print self.ls(command, host, full_path)
response = self[matching_requests[0]]
tmp_file = tempfile.NamedTemporaryFile(delete=False)
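For reference, a hypothetical end-to-end edit using the renamed flag (any $EDITOR works; vim is just an example):

    $ EDITOR=vim ./httparchive.py edit --host www.example.com \
        --full_path /foo archive.wpr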
@@ -369,16 +354,16 @@ class HttpArchive(dict, persistentmixin.PersistentMixin):
request: an ArchivedHttpRequest
use_path: If True, closest matching request's path component must match.
(Note: this refers to the 'path' component within the URL, not the
- query string component.)
- If use_path=False, candidate will NOT match in example below
+ 'full path' which includes the query string component.)
+ If use_path=True, candidate will NOT match in example below
e.g. request = GET www.test.com/path?aaa
candidate = GET www.test.com/diffpath?aaa
Returns:
If a close match is found, return the instance of ArchivedHttpRequest.
Otherwise, return None.
"""
- path = request.path if use_path else None
- requests = self.get_requests(request.command, request.host, path,
+ full_path = request.full_path if use_path else None
+ requests = self.get_requests(request.command, request.host, full_path,
is_ssl=request.is_ssl, use_query=not use_path)
if not requests:
@@ -444,21 +429,23 @@ class ArchivedHttpRequest(object):
'if-none-match', 'if-match',
'if-modified-since', 'if-unmodified-since']
- def __init__(self, command, host, path, request_body, headers, is_ssl=False):
+ def __init__(self, command, host, full_path, request_body, headers,
+ is_ssl=False):
"""Initialize an ArchivedHttpRequest.
Args:
command: a string (e.g. 'GET' or 'POST').
host: a host name (e.g. 'www.google.com').
- path: a request path (e.g. '/search?q=dogs').
+ full_path: a request path. Includes everything after the host & port in
+ the URL (e.g. '/search?q=dogs').
request_body: a request body string for a POST or None.
headers: {key: value, ...} where key and value are strings.
is_ssl: a boolean which is True iff the request is made via SSL.
"""
self.command = command
self.host = host
- self.path = path
- self.path_without_query = urlparse.urlparse(path).path
+ self.full_path = full_path
+ self.path = urlparse.urlparse(full_path).path if full_path else None
self.request_body = request_body
self.headers = headers
self.is_ssl = is_ssl
@@ -468,10 +455,10 @@ class ArchivedHttpRequest(object):
def __str__(self):
scheme = 'https' if self.is_ssl else 'http'
return '%s %s://%s%s %s' % (
- self.command, scheme, self.host, self.path, self.trimmed_headers)
+ self.command, scheme, self.host, self.full_path, self.trimmed_headers)
def __repr__(self):
- return repr((self.command, self.host, self.path, self.request_body,
+ return repr((self.command, self.host, self.full_path, self.request_body,
self.trimmed_headers, self.is_ssl))
def __hash__(self):
@@ -500,11 +487,19 @@ class ArchivedHttpRequest(object):
raise HttpArchiveException(
'Archived HTTP request is missing "headers". The HTTP archive is'
' likely from a previous version and must be re-recorded.')
+ if 'path' in state:
+ # before, 'path' and 'path_without_query' were used and 'path' was
+ # pickled. Now, 'path' has been renamed to 'full_path' and
+ # 'path_without_query' has been renamed to 'path'. 'full_path' is
+ # pickled, but 'path' is not. If we see 'path' here it means we are
+ # dealing with an older archive.
+ state['full_path'] = state['path']
+ del state['path']
state['trimmed_headers'] = self._TrimHeaders(dict(state['headers']))
if 'is_ssl' not in state:
state['is_ssl'] = False
self.__dict__.update(state)
- self.path_without_query = urlparse.urlparse(self.path).path
+ self.path = urlparse.urlparse(self.full_path).path
self.formatted_request = self._GetFormattedRequest()
def __getstate__(self):
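The shim above keeps archives pickled before the rename loadable: a stored 'path' is moved to 'full_path', and 'path' is recomputed as a derived attribute. The same pattern in isolation, as a toy (non-patch) sketch:

    import urlparse

    class Record(object):  # hypothetical stand-in for ArchivedHttpRequest
      def __setstate__(self, state):
        if 'path' in state:  # archive pickled before the rename
          state['full_path'] = state.pop('path')
        self.__dict__.update(state)
        # 'path' is derived on load and never pickled (see __getstate__):
        self.path = urlparse.urlparse(self.full_path).path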
@@ -515,7 +510,7 @@ class ArchivedHttpRequest(object):
"""
state = self.__dict__.copy()
del state['trimmed_headers']
- del state['path_without_query']
+ del state['path']
del state['formatted_request']
return state
@@ -526,7 +521,7 @@ class ArchivedHttpRequest(object):
A string consisting of the request. Example:
'GET www.example.com/path\nHeader-Key: header value\n'
"""
- parts = ['%s %s%s\n' % (self.command, self.host, self.path)]
+ parts = ['%s %s%s\n' % (self.command, self.host, self.full_path)]
if self.request_body:
parts.append('%s\n' % self.request_body)
for k, v in self.trimmed_headers:
@@ -534,14 +529,14 @@ class ArchivedHttpRequest(object):
parts.append('%s: %s\n' % (k, v))
return ''.join(parts)
- def matches(self, command=None, host=None, path_with_query=None, is_ssl=None,
+ def matches(self, command=None, host=None, full_path=None, is_ssl=None,
use_query=True):
"""Returns true iff the request matches all parameters.
Args:
command: a string (e.g. 'GET' or 'POST').
host: a host name (e.g. 'www.google.com').
- path_with_query: a request path with query string (e.g. '/search?q=dogs')
+ full_path: a request path with query string (e.g. '/search?q=dogs')
is_ssl: whether the request is secure.
use_query:
If use_query is True, request matching uses both the hierarchical path
@@ -563,12 +558,12 @@ class ArchivedHttpRequest(object):
return False
if host is not None and host != self.host:
return False
- if path_with_query is None:
+ if full_path is None:
return True
if use_query:
- return path_with_query == self.path
+ return full_path == self.full_path
else:
- return self.path_without_query == urlparse.urlparse(path_with_query).path
+ return self.path == urlparse.urlparse(full_path).path
@classmethod
def _TrimHeaders(cls, headers):
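To make the two matching modes concrete, assume a hypothetical request req archived as GET www.test.com/search?q=dogs:

    req.matches(full_path='/search?q=dogs', use_query=True)   # True: exact match
    req.matches(full_path='/search?q=cats', use_query=True)   # False: query differs
    req.matches(full_path='/search?q=cats', use_query=False)  # True: same '/search'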
@@ -578,6 +573,7 @@ class ArchivedHttpRequest(object):
- accept: Causes problems with www.bing.com. During record, CSS is fetched
with *. During replay, it's text/css.
- accept-charset, accept-language, referer: vary between clients.
+ - cache-control: sometimes sent from Chrome with 'max-age=0' as value.
- connection, method, scheme, url, version: Cause problems with spdy.
- cookie: Extremely sensitive to request/response order.
- keep-alive: Not supported by Web Page Replay.
@@ -604,7 +600,7 @@ class ArchivedHttpRequest(object):
if headers['accept-encoding'].endswith(','):
headers['accept-encoding'] = headers['accept-encoding'][:-1]
undesirable_keys = [
- 'accept', 'accept-charset', 'accept-language',
+ 'accept', 'accept-charset', 'accept-language', 'cache-control',
'connection', 'cookie', 'keep-alive', 'method',
'referer', 'scheme', 'url', 'version', 'user-agent', 'proxy-connection',
'x-chrome-variations']
@@ -622,7 +618,7 @@ class ArchivedHttpRequest(object):
stripped_headers = dict((k, v) for k, v in self.headers.iteritems()
if k.lower() not in self.CONDITIONAL_HEADERS)
return ArchivedHttpRequest(
- self.command, self.host, self.path, self.request_body,
+ self.command, self.host, self.full_path, self.request_body,
stripped_headers, self.is_ssl)
class ArchivedHttpResponse(object):
@@ -651,8 +647,14 @@ class ArchivedHttpResponse(object):
Concatenating the chunks gives the complete contents
(i.e. the chunks do not have any lengths or delimiters).
Do not include the final, zero-length chunk that marks the end.
- delays: dict of (ms) delays before "headers" and "data". For example,
- {'headers': 50, 'data': [0, 10, 10]}
+ delays: dict of (ms) delays for 'connect', 'headers' and 'data'.
+ e.g. {'connect': 50, 'headers': 150, 'data': [0, 10, 10]}
+ connect - The time to connect to the server.
+ Each resource has a value because Replay's record mode captures it.
+ This includes the time for the SYN and SYN/ACK (1 rtt).
+ headers - The time elapsed between the TCP connect and the headers.
+ This typically includes all the server-time to generate a response.
+ data - If the response is chunked, these are the times for each chunk.
"""
self.version = version
self.status = status
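A replayer that honors every entry in the expanded dict would wait, for the docstring's example values, roughly the sum of its parts:

    delays = {'connect': 50, 'headers': 150, 'data': [0, 10, 10]}
    total_ms = delays['connect'] + delays['headers'] + sum(delays['data'])
    # total_ms == 220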
@@ -667,6 +669,7 @@ class ArchivedHttpResponse(object):
expected_num_delays = len(self.response_data)
if not self.delays:
self.delays = {
+ 'connect': 0,
'headers': 0,
'data': [0] * expected_num_delays
}
@@ -697,6 +700,7 @@ class ArchivedHttpResponse(object):
"""
if 'server_delays' in state:
state['delays'] = {
+ 'connect': 0,
'headers': 0,
'data': state['server_delays']
}
@@ -708,15 +712,9 @@ class ArchivedHttpResponse(object):
def get_header(self, key, default=None):
for k, v in self.headers:
- if key == k:
- return v
- return default
-
- def get_header_case_insensitive(self, key):
- for k, v in self.headers:
if key.lower() == k.lower():
return v
- return None
+ return default
def set_header(self, key, value):
for i, (k, v) in enumerate(self.headers):
@@ -727,10 +725,41 @@ class ArchivedHttpResponse(object):
def remove_header(self, key):
for i, (k, v) in enumerate(self.headers):
- if key == k:
+ if key.lower() == k.lower():
self.headers.pop(i)
return
+ def _get_epoch_seconds(self, date_str):
+ """Return the epoch seconds of a date header.
+
+ Args:
+ date_str: a date string (e.g. "Thu, 01 Dec 1994 16:00:00 GMT")
+ Returns:
+ epoch seconds as a float
+ """
+ date_tuple = email.utils.parsedate(date_str)
+ if date_tuple:
+ return calendar.timegm(date_tuple)
+ return None
+
+ def update_date(self, date_str, now=None):
+ """Return an updated date based on its delta from the "Date" header.
+
+ For example, if |date_str| is one week later than the "Date" header,
+ then the returned date string is one week later than the current date.
+
+ Args:
+ date_str: a date string (e.g. "Thu, 01 Dec 1994 16:00:00 GMT")
+ now: epoch seconds to treat as the current time, or None to use
+ time.time() (useful in tests).
+ Returns:
+ a date string
+ """
+ date_seconds = self._get_epoch_seconds(self.get_header('date'))
+ header_seconds = self._get_epoch_seconds(date_str)
+ if date_seconds and header_seconds:
+ updated_seconds = header_seconds + (now or time.time()) - date_seconds
+ return email.utils.formatdate(updated_seconds, usegmt=True)
+ return date_str
+
def is_gzip(self):
return self.get_header('content-encoding') == 'gzip'
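With get_header() now case-insensitive and update_date() available, rebasing any date-valued header is a one-liner. A hypothetical example, given an ArchivedHttpResponse response whose 'expires' header was recorded one hour after its 'date' header:

    expires = response.update_date(response.get_header('Expires'))
    # Reads one hour after the current time; the lookup succeeds despite
    # the capitalized header name.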
@@ -748,7 +777,8 @@ class ArchivedHttpResponse(object):
content_type = self.get_header('content-type')
if (not content_type or
not (content_type.startswith('text/') or
- content_type == 'application/x-javascript')):
+ content_type == 'application/x-javascript' or
+ content_type.startswith('application/json'))):
return None
if self.is_compressed():
uncompressed_chunks = httpzlib.uncompress_chunks(
@@ -794,6 +824,7 @@ class ArchivedHttpResponse(object):
Args:
delays_text: JSON encoded text such as the following:
{
+ connect: 80,
headers: 80,
data: [6, 55, 0]
}
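The docstring mirrors the relaxed style used elsewhere in this file; a strict parser such as Python's json module would require quoted keys, e.g.:

    {
      "connect": 80,
      "headers": 80,
      "data": [6, 55, 0]
    }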
@@ -854,10 +885,10 @@ def main():
action='store',
type='string',
help='Only show URLs matching this host.')
- option_parser.add_option('-p', '--path', default=None,
+ option_parser.add_option('-p', '--full_path', default=None,
action='store',
type='string',
- help='Only show URLs matching this path.')
+ help='Only show URLs matching this full path.')
option_parser.add_option('-f', '--merged_file', default=None,
action='store',
type='string',
@@ -878,18 +909,18 @@ def main():
http_archive = HttpArchive.Load(replay_file)
if command == 'ls':
- print http_archive.ls(options.command, options.host, options.path)
+ print http_archive.ls(options.command, options.host, options.full_path)
elif command == 'cat':
- print http_archive.cat(options.command, options.host, options.path)
+ print http_archive.cat(options.command, options.host, options.full_path)
elif command == 'stats':
- print http_archive.stats(options.command, options.host, options.path)
+ print http_archive.stats(options.command, options.host, options.full_path)
elif command == 'merge':
if not options.merged_file:
print 'Error: Must specify a merged file name (use --merged_file)'
return
http_archive.merge(options.merged_file, args[2:])
elif command == 'edit':
- http_archive.edit(options.command, options.host, options.path)
+ http_archive.edit(options.command, options.host, options.full_path)
http_archive.Persist(replay_file)
else:
option_parser.error('Unknown command "%s"' % command)
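Putting the flag rename together, a query-string-precise lookup now reads, for example:

    $ ./httparchive.py cat --host www.example.com --full_path '/search?q=dogs' archive.wpr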