diff options
Diffstat (limited to 'chromium/third_party/webpagereplay/httparchive.py')
-rwxr-xr-x | chromium/third_party/webpagereplay/httparchive.py | 177 |
1 files changed, 104 insertions, 73 deletions
diff --git a/chromium/third_party/webpagereplay/httparchive.py b/chromium/third_party/webpagereplay/httparchive.py index e0c468234d4..3e710634f1b 100755 --- a/chromium/third_party/webpagereplay/httparchive.py +++ b/chromium/third_party/webpagereplay/httparchive.py @@ -22,13 +22,13 @@ To view the content of all URLs from example.com: $ ./httparchive.py cat --host example.com archive.wpr To view the content of a particular URL: - $ ./httparchive.py cat --host www.example.com --path /foo archive.wpr + $ ./httparchive.py cat --host www.example.com --full_path /foo archive.wpr To view the content of all URLs: $ ./httparchive.py cat archive.wpr To edit a particular URL: - $ ./httparchive.py edit --host www.example.com --path /foo archive.wpr + $ ./httparchive.py edit --host www.example.com --full_path /foo archive.wpr To print statistics of an archive: $ ./httparchive.py stats archive.wpr @@ -40,6 +40,7 @@ To merge multiple archives $ ./httparchive.py merge --merged_file new.wpr archive1.wpr archive2.wpr ... """ +import calendar import difflib import email.utils import httplib @@ -57,8 +58,6 @@ import time import urlparse from collections import defaultdict -import platformsettings - def LogRunTime(fn): """Annotation which logs the run time of the function.""" @@ -86,7 +85,6 @@ class HttpArchive(dict, persistentmixin.PersistentMixin): Persist(filename) Attributes: - server_rtt: dict of {hostname, server rtt in milliseconds} responses_by_host: dict of {hostname, {request: response}}. This must remain in sync with the underlying dict of self. It is used as an optimization so that get_requests() doesn't have to linearly search all requests in @@ -94,7 +92,6 @@ class HttpArchive(dict, persistentmixin.PersistentMixin): """ def __init__(self): - self.server_rtt = {} self.responses_by_host = defaultdict(dict) def __setstate__(self, state): @@ -127,21 +124,6 @@ class HttpArchive(dict, persistentmixin.PersistentMixin): super(HttpArchive, self).__delitem__(key) del self.responses_by_host[key.host][key] - def get_server_rtt(self, server): - """Retrieves the round trip time (rtt) to the server - - Args: - server: the hostname of the server - - Returns: - round trip time to the server in seconds, or 0 if unavailable - """ - if server not in self.server_rtt: - # TODO(tonyg): Pinging inline with the request causes timeouts. Need to - # find a way to restore this functionality. - self.server_rtt[server] = 0 # platform_settings.ping_rtt(server) - return self.server_rtt[server] - def get(self, request, default=None): """Return the archived response for a given request. @@ -185,8 +167,8 @@ class HttpArchive(dict, persistentmixin.PersistentMixin): def get_conditional_status(self, request, response): status = 200 last_modified = email.utils.parsedate( - response.get_header_case_insensitive('last-modified')) - response_etag = response.get_header_case_insensitive('etag') + response.update_date(response.get_header('last-modified'))) + response_etag = response.get_header('etag') is_get_or_head = request.command.upper() in ('GET', 'HEAD') match_value = request.headers.get('if-match', None) @@ -233,25 +215,27 @@ class HttpArchive(dict, persistentmixin.PersistentMixin): return True return False - def get_requests(self, command=None, host=None, path=None, is_ssl=None, + def get_requests(self, command=None, host=None, full_path=None, is_ssl=None, use_query=True): """Return a list of requests that match the given args.""" if host: return [r for r in self.responses_by_host[host] - if r.matches(command, None, path, is_ssl, use_query=use_query)] + if r.matches(command, None, full_path, is_ssl, + use_query=use_query)] else: return [r for r in self - if r.matches(command, host, path, is_ssl, use_query=use_query)] + if r.matches(command, host, full_path, is_ssl, + use_query=use_query)] - def ls(self, command=None, host=None, path=None): + def ls(self, command=None, host=None, full_path=None): """List all URLs that match given params.""" return ''.join(sorted( - '%s\n' % r for r in self.get_requests(command, host, path))) + '%s\n' % r for r in self.get_requests(command, host, full_path))) - def cat(self, command=None, host=None, path=None): + def cat(self, command=None, host=None, full_path=None): """Print the contents of all URLs that match given params.""" out = StringIO.StringIO() - for request in self.get_requests(command, host, path): + for request in self.get_requests(command, host, full_path): print >>out, str(request) print >>out, 'Untrimmed request headers:' for k in request.headers: @@ -281,9 +265,9 @@ class HttpArchive(dict, persistentmixin.PersistentMixin): print >>out, '=' * 70 return out.getvalue() - def stats(self, command=None, host=None, path=None): + def stats(self, command=None, host=None, full_path=None): """Print stats about the archive for all URLs that match given params.""" - matching_requests = self.get_requests(command, host, path) + matching_requests = self.get_requests(command, host, full_path) if not matching_requests: print 'Failed to find any requests matching given command, host, path.' return @@ -338,21 +322,22 @@ class HttpArchive(dict, persistentmixin.PersistentMixin): self[r] = http_archive_other[r] self.Persist('%s' % merged_archive) - def edit(self, command=None, host=None, path=None): + def edit(self, command=None, host=None, full_path=None): """Edits the single request which matches given params.""" editor = os.getenv('EDITOR') if not editor: print 'You must set the EDITOR environmental variable.' return - matching_requests = self.get_requests(command, host, path) + matching_requests = self.get_requests(command, host, full_path) if not matching_requests: - print 'Failed to find any requests matching given command, host, path.' + print ('Failed to find any requests matching given command, host, ' + 'full_path.') return if len(matching_requests) > 1: print 'Found multiple matching requests. Please refine.' - print self.ls(command, host, path) + print self.ls(command, host, full_path) response = self[matching_requests[0]] tmp_file = tempfile.NamedTemporaryFile(delete=False) @@ -369,16 +354,16 @@ class HttpArchive(dict, persistentmixin.PersistentMixin): request: an ArchivedHttpRequest use_path: If True, closest matching request's path component must match. (Note: this refers to the 'path' component within the URL, not the - query string component.) - If use_path=False, candidate will NOT match in example below + 'full path' which includes the query string component.) + If use_path=True, candidate will NOT match in example below e.g. request = GET www.test.com/path?aaa candidate = GET www.test.com/diffpath?aaa Returns: If a close match is found, return the instance of ArchivedHttpRequest. Otherwise, return None. """ - path = request.path if use_path else None - requests = self.get_requests(request.command, request.host, path, + full_path = request.full_path if use_path else None + requests = self.get_requests(request.command, request.host, full_path, is_ssl=request.is_ssl, use_query=not use_path) if not requests: @@ -444,21 +429,23 @@ class ArchivedHttpRequest(object): 'if-none-match', 'if-match', 'if-modified-since', 'if-unmodified-since'] - def __init__(self, command, host, path, request_body, headers, is_ssl=False): + def __init__(self, command, host, full_path, request_body, headers, + is_ssl=False): """Initialize an ArchivedHttpRequest. Args: command: a string (e.g. 'GET' or 'POST'). host: a host name (e.g. 'www.google.com'). - path: a request path (e.g. '/search?q=dogs'). + full_path: a request path. Includes everything after the host & port in + the URL (e.g. '/search?q=dogs'). request_body: a request body string for a POST or None. headers: {key: value, ...} where key and value are strings. is_ssl: a boolean which is True iff request is make via SSL. """ self.command = command self.host = host - self.path = path - self.path_without_query = urlparse.urlparse(path).path + self.full_path = full_path + self.path = urlparse.urlparse(full_path).path if full_path else None self.request_body = request_body self.headers = headers self.is_ssl = is_ssl @@ -468,10 +455,10 @@ class ArchivedHttpRequest(object): def __str__(self): scheme = 'https' if self.is_ssl else 'http' return '%s %s://%s%s %s' % ( - self.command, scheme, self.host, self.path, self.trimmed_headers) + self.command, scheme, self.host, self.full_path, self.trimmed_headers) def __repr__(self): - return repr((self.command, self.host, self.path, self.request_body, + return repr((self.command, self.host, self.full_path, self.request_body, self.trimmed_headers, self.is_ssl)) def __hash__(self): @@ -500,11 +487,19 @@ class ArchivedHttpRequest(object): raise HttpArchiveException( 'Archived HTTP request is missing "headers". The HTTP archive is' ' likely from a previous version and must be re-recorded.') + if 'path' in state: + # before, 'path' and 'path_without_query' were used and 'path' was + # pickled. Now, 'path' has been renamed to 'full_path' and + # 'path_without_query' has been renamed to 'path'. 'full_path' is + # pickled, but 'path' is not. If we see 'path' here it means we are + # dealing with an older archive. + state['full_path'] = state['path'] + del state['path'] state['trimmed_headers'] = self._TrimHeaders(dict(state['headers'])) if 'is_ssl' not in state: state['is_ssl'] = False self.__dict__.update(state) - self.path_without_query = urlparse.urlparse(self.path).path + self.path = urlparse.urlparse(self.full_path).path self.formatted_request = self._GetFormattedRequest() def __getstate__(self): @@ -515,7 +510,7 @@ class ArchivedHttpRequest(object): """ state = self.__dict__.copy() del state['trimmed_headers'] - del state['path_without_query'] + del state['path'] del state['formatted_request'] return state @@ -526,7 +521,7 @@ class ArchivedHttpRequest(object): A string consisting of the request. Example: 'GET www.example.com/path\nHeader-Key: header value\n' """ - parts = ['%s %s%s\n' % (self.command, self.host, self.path)] + parts = ['%s %s%s\n' % (self.command, self.host, self.full_path)] if self.request_body: parts.append('%s\n' % self.request_body) for k, v in self.trimmed_headers: @@ -534,14 +529,14 @@ class ArchivedHttpRequest(object): parts.append('%s: %s\n' % (k, v)) return ''.join(parts) - def matches(self, command=None, host=None, path_with_query=None, is_ssl=None, + def matches(self, command=None, host=None, full_path=None, is_ssl=None, use_query=True): """Returns true iff the request matches all parameters. Args: command: a string (e.g. 'GET' or 'POST'). host: a host name (e.g. 'www.google.com'). - path_with_query: a request path with query string (e.g. '/search?q=dogs') + full_path: a request path with query string (e.g. '/search?q=dogs') is_ssl: whether the request is secure. use_query: If use_query is True, request matching uses both the hierarchical path @@ -563,12 +558,12 @@ class ArchivedHttpRequest(object): return False if host is not None and host != self.host: return False - if path_with_query is None: + if full_path is None: return True if use_query: - return path_with_query == self.path + return full_path == self.full_path else: - return self.path_without_query == urlparse.urlparse(path_with_query).path + return self.path == urlparse.urlparse(full_path).path @classmethod def _TrimHeaders(cls, headers): @@ -578,6 +573,7 @@ class ArchivedHttpRequest(object): - accept: Causes problems with www.bing.com. During record, CSS is fetched with *. During replay, it's text/css. - accept-charset, accept-language, referer: vary between clients. + - cache-control: sometimes sent from Chrome with 'max-age=0' as value. - connection, method, scheme, url, version: Cause problems with spdy. - cookie: Extremely sensitive to request/response order. - keep-alive: Not supported by Web Page Replay. @@ -604,7 +600,7 @@ class ArchivedHttpRequest(object): if headers['accept-encoding'].endswith(','): headers['accept-encoding'] = headers['accept-encoding'][:-1] undesirable_keys = [ - 'accept', 'accept-charset', 'accept-language', + 'accept', 'accept-charset', 'accept-language', 'cache-control', 'connection', 'cookie', 'keep-alive', 'method', 'referer', 'scheme', 'url', 'version', 'user-agent', 'proxy-connection', 'x-chrome-variations'] @@ -622,7 +618,7 @@ class ArchivedHttpRequest(object): stripped_headers = dict((k, v) for k, v in self.headers.iteritems() if k.lower() not in self.CONDITIONAL_HEADERS) return ArchivedHttpRequest( - self.command, self.host, self.path, self.request_body, + self.command, self.host, self.full_path, self.request_body, stripped_headers, self.is_ssl) class ArchivedHttpResponse(object): @@ -651,8 +647,14 @@ class ArchivedHttpResponse(object): Concatenating the chunks gives the complete contents (i.e. the chunks do not have any lengths or delimiters). Do not include the final, zero-length chunk that marks the end. - delays: dict of (ms) delays before "headers" and "data". For example, - {'headers': 50, 'data': [0, 10, 10]} + delays: dict of (ms) delays for 'connect', 'headers' and 'data'. + e.g. {'connect': 50, 'headers': 150, 'data': [0, 10, 10]} + connect - The time to connect to the server. + Each resource has a value because Replay's record mode captures it. + This includes the time for the SYN and SYN/ACK (1 rtt). + headers - The time elapsed between the TCP connect and the headers. + This typically includes all the server-time to generate a response. + data - If the response is chunked, these are the times for each chunk. """ self.version = version self.status = status @@ -667,6 +669,7 @@ class ArchivedHttpResponse(object): expected_num_delays = len(self.response_data) if not self.delays: self.delays = { + 'connect': 0, 'headers': 0, 'data': [0] * expected_num_delays } @@ -697,6 +700,7 @@ class ArchivedHttpResponse(object): """ if 'server_delays' in state: state['delays'] = { + 'connect': 0, 'headers': 0, 'data': state['server_delays'] } @@ -708,15 +712,9 @@ class ArchivedHttpResponse(object): def get_header(self, key, default=None): for k, v in self.headers: - if key == k: - return v - return default - - def get_header_case_insensitive(self, key): - for k, v in self.headers: if key.lower() == k.lower(): return v - return None + return default def set_header(self, key, value): for i, (k, v) in enumerate(self.headers): @@ -727,10 +725,41 @@ class ArchivedHttpResponse(object): def remove_header(self, key): for i, (k, v) in enumerate(self.headers): - if key == k: + if key.lower() == k.lower(): self.headers.pop(i) return + def _get_epoch_seconds(self, date_str): + """Return the epoch seconds of a date header. + + Args: + date_str: a date string (e.g. "Thu, 01 Dec 1994 16:00:00 GMT") + Returns: + epoch seconds as a float + """ + date_tuple = email.utils.parsedate(date_str) + if date_tuple: + return calendar.timegm(date_tuple) + return None + + def update_date(self, date_str, now=None): + """Return an updated date based on its delta from the "Date" header. + + For example, if |date_str| is one week later than the "Date" header, + then the returned date string is one week later than the current date. + + Args: + date_str: a date string (e.g. "Thu, 01 Dec 1994 16:00:00 GMT") + Returns: + a date string + """ + date_seconds = self._get_epoch_seconds(self.get_header('date')) + header_seconds = self._get_epoch_seconds(date_str) + if date_seconds and header_seconds: + updated_seconds = header_seconds + (now or time.time()) - date_seconds + return email.utils.formatdate(updated_seconds, usegmt=True) + return date_str + def is_gzip(self): return self.get_header('content-encoding') == 'gzip' @@ -748,7 +777,8 @@ class ArchivedHttpResponse(object): content_type = self.get_header('content-type') if (not content_type or not (content_type.startswith('text/') or - content_type == 'application/x-javascript')): + content_type == 'application/x-javascript' or + content_type.startswith('application/json'))): return None if self.is_compressed(): uncompressed_chunks = httpzlib.uncompress_chunks( @@ -794,6 +824,7 @@ class ArchivedHttpResponse(object): Args: delays_text: JSON encoded text such as the following: { + connect: 80, headers: 80, data: [6, 55, 0] } @@ -854,10 +885,10 @@ def main(): action='store', type='string', help='Only show URLs matching this host.') - option_parser.add_option('-p', '--path', default=None, + option_parser.add_option('-p', '--full_path', default=None, action='store', type='string', - help='Only show URLs matching this path.') + help='Only show URLs matching this full path.') option_parser.add_option('-f', '--merged_file', default=None, action='store', type='string', @@ -878,18 +909,18 @@ def main(): http_archive = HttpArchive.Load(replay_file) if command == 'ls': - print http_archive.ls(options.command, options.host, options.path) + print http_archive.ls(options.command, options.host, options.full_path) elif command == 'cat': - print http_archive.cat(options.command, options.host, options.path) + print http_archive.cat(options.command, options.host, options.full_path) elif command == 'stats': - print http_archive.stats(options.command, options.host, options.path) + print http_archive.stats(options.command, options.host, options.full_path) elif command == 'merge': if not options.merged_file: print 'Error: Must specify a merged file name (use --merged_file)' return http_archive.merge(options.merged_file, args[2:]) elif command == 'edit': - http_archive.edit(options.command, options.host, options.path) + http_archive.edit(options.command, options.host, options.full_path) http_archive.Persist(replay_file) else: option_parser.error('Unknown command "%s"' % command) |