Diffstat (limited to 'tools/3rdparty/sitemap_gen-1.4/test_sitemap_gen.py')
-rwxr-xr-x | tools/3rdparty/sitemap_gen-1.4/test_sitemap_gen.py | 765
1 file changed, 765 insertions, 0 deletions
diff --git a/tools/3rdparty/sitemap_gen-1.4/test_sitemap_gen.py b/tools/3rdparty/sitemap_gen-1.4/test_sitemap_gen.py
new file mode 100755
index 0000000..a02f417
--- /dev/null
+++ b/tools/3rdparty/sitemap_gen-1.4/test_sitemap_gen.py
@@ -0,0 +1,765 @@
+#!/usr/bin/env python
+#
+# Copyright (c) 2004, 2005 Google Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+#   notice, this list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright
+#   notice, this list of conditions and the following disclaimer in
+#   the documentation and/or other materials provided with the
+#   distribution.
+#
+# * Neither the name of Google nor the names of its contributors may
+#   be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+#
+# The sitemap_gen.py script is written in Python 2.2 and released to
+# the open source community for continuous improvements under the BSD
+# 2.0 new license, which can be found at:
+#
+#   http://www.opensource.org/licenses/bsd-license.php
+#
+
+"""Unit tests for sitemap_gen.py, a script for generating sitemaps
+for a web server.
+"""
+
+# Please be careful that all syntax used in this file can be parsed on
+# Python 1.5 -- this version check is not evaluated until after the
+# entire file has been parsed.
+import sys
+if sys.hexversion < 0x02020000:
+  print 'This script requires Python 2.2 or later.'
+  print 'Currently run with version: %s' % sys.version
+  sys.exit(1)
+
+import binascii
+import fnmatch
+import gzip
+import os
+import tempfile
+import unittest
+import xml.dom.minidom
+import sitemap_gen
+
+# True and False were introduced in Python2.2.2
+try:
+  testTrue=True
+  del testTrue
+except NameError:
+  True=1
+  False=0
+
+
+class URLCounter(object):
+  """Counts returned URLs, determines how many valid v. invalid we get.
+  This is a helper for consuming what the many Input* objects produce."""
+  def __init__(self, root, print_invalid, expected):
+    """Input:
+    root :: root URL for calling the URL's own Validate()
+    print_invalid :: print to output all invalid URLs
+    expected :: sequence of wildcard filters to validate against
+    """
+    self._root = root
+    self._print = print_invalid
+    self._expected = expected
+    self._valid = 0
+    self._invalid = 0
+  #end def __init__
+
+  def Reset(self):
+    """Reset our counts without harming the validity filters."""
+    self._valid = 0
+    self._invalid = 0
+  #end def Reset
+
+  def Valid(self):
+    """Returns number of valid URLs."""
+    return self._valid
+  #end def Valid
+
+  def Invalid(self):
+    """Returns number of invalid URLs."""
+    return self._invalid
+  #end def Valid
+
+  def Count(self, url, allow_fragment):
+    """The 'please consume this URL' function called by the URL producer."""
+    valid = True
+    if valid:
+      valid = url.Validate(self._root, allow_fragment)
+    if valid:
+      for filter in self._expected:
+        valid = fnmatch.fnmatchcase(url.loc, filter)
+        if valid:
+          break
+    if valid:
+      self._valid = self._valid + 1
+    else:
+      if self._print:
+        url.Log(prefix='URLCounter', level=0)
+      self._invalid = self._invalid + 1
+  #end def Count
+#end class URLCounter
+
+
+class TestSiteMap(unittest.TestCase):
+  """Tests the sitemap_gen application."""
+
+  def testTimestampISO8601(self):
+    """ Checks whether the TimestampISO8601 function works. """
+    self.assertEqual(sitemap_gen.TimestampISO8601(23),
+                     '1970-01-01T00:00:23Z')
+    self.assertEqual(sitemap_gen.TimestampISO8601(549876543),
+                     '1987-06-05T07:29:03Z')
+  #end def testTimestampISO8601
+
+  def testExpandPathAttribute(self):
+    """ Verifies our path globbing function works. """
+    temppath = tempfile.mktemp()
+    tempwild = tempfile.tempdir
+    if tempwild:
+      tempwild = tempwild + os.sep
+    tempwild = tempwild + '*'
+    try:
+      open(temppath, 'w').close()
+
+      dict1 = {}
+      dict2 = {'alpha' : 'beta', 'path' : 'DoesNotExist987654321.xyz'}
+      dict3 = {'alpha' : 'beta', 'path' : tempwild}
+
+      res1 = sitemap_gen.ExpandPathAttribute(dict1, 'path')
+      res2 = sitemap_gen.ExpandPathAttribute(dict2, 'path')
+      res3 = sitemap_gen.ExpandPathAttribute(dict3, 'path')
+
+      self.assertEqual(len(res1), 1)
+      self.assertEqual(res1[0], dict1)
+
+      self.assertEqual(len(res2), 1)
+      self.assertEqual(res2[0], dict2)
+
+      self.assert_(len(res3) >= 1)
+      anymatch = False
+      for res in res3:
+        path = res['path']
+        if path.find(temppath) >= 0:
+          anymatch = True
+        self.assertEqual(res['alpha'], 'beta')
+      self.assert_(anymatch)
+
+    finally:
+      os.unlink(temppath)
+  #end def testExpandPathAttribute
+
+  def testEncoder(self):
+    """ Tests minimal functionality of the learning Unicode codec """
+    ENC_UTF8 = 'UTF-8'
+    ENC_LATIN1 = 'ISO-8859-1'
+    ENC_CYRILLIC = 'ISO-8859-5'
+
+    STR1_LATIN1 = 'has an ' + binascii.a2b_hex('FC') + 'mlat'
+    STR1_UTF8 = 'has an ' + binascii.a2b_hex('C3BC') + 'mlat'
+    STR1_UCS2 = 'has an ' + unichr(252) + 'mlat'
+
+    STR2_LATIN1 = 'DRAGON' + binascii.a2b_hex('A7') + '!'
+    STR2_CYRILLIC = 'DRAGON' + binascii.a2b_hex('FD') + '!'
+    STR2_UCS2 = 'DRAGON' + unichr(167) + '!'
+
+    # Spawn our own encoder instance so we don't abuse the module one.
+    encoder = sitemap_gen.Encoder()
+
+    # Convert Latin-1 to UTF-8, by way of Unicode
+    encoder.SetUserEncoding(ENC_LATIN1)
+    self.assertEqual(encoder.WidenText(STR1_LATIN1, None), STR1_UCS2)
+    self.assertEqual(encoder.NarrowText(STR1_UCS2, ENC_UTF8), STR1_UTF8)
+
+    # Test learning. STR1 has no Cyrillic equivalent, STR2 just changes.
+    encoder.SetUserEncoding(None)
+    encoder._learned = []
+    self.assertEqual(encoder.WidenText(STR2_CYRILLIC, ENC_CYRILLIC), STR2_UCS2)
+    self.assertEqual(encoder.WidenText(STR2_CYRILLIC, None), STR2_UCS2)
+    self.assertEqual(encoder.NarrowText(STR1_UCS2, None), STR1_UTF8)
+    self.assert_(not encoder._learned)
+    self.assertEqual(encoder.NarrowText(STR1_UCS2, ENC_LATIN1), STR1_LATIN1)
+    self.assertEqual(encoder.NarrowText(STR1_UCS2, None), STR1_LATIN1)
+    self.assertEqual(encoder.NarrowText(STR2_UCS2, None), STR2_LATIN1)
+  #end def testEncoder
+
+  def testURL(self):
+    """ Vigorously tests our URL attribute processing. """
+
+    # Test the IsAbsolute method
+    self.assert_(sitemap_gen.URL.IsAbsolute('http://a.b.c/d/e.txt?f=g#h'))
+    self.assert_(sitemap_gen.URL.IsAbsolute('http://a.b.c'))
+    self.assert_(not sitemap_gen.URL.IsAbsolute('http:///d/e.txt?f=g#h'))
+    self.assert_(not sitemap_gen.URL.IsAbsolute('http:a.b.c/d/e.txt?f=g#h'))
+    self.assert_(not sitemap_gen.URL.IsAbsolute('a.b.c/d/e.txt?f=g#h'))
+    self.assert_(not sitemap_gen.URL.IsAbsolute('/d/e.txt?f=g#h'))
+
+    # Canonicalize our base URL
+    BASE_R = 'http://www.example.com/f' + binascii.a2b_hex('F6F6') + '/'
+    BASE_C = 'http://www.example.com/f%F6%F6/'
+    sitemap_gen.encoder.SetUserEncoding('ISO-8859-1')
+    self.assertEqual(sitemap_gen.URL.Canonicalize(BASE_R), BASE_C)
+
+    # Test how canonicalization handles pre-quoted values
+    self.assertEqual(sitemap_gen.URL.Canonicalize(
+      'http://www.example.com/my%25thing'),
+      'http://www.example.com/my%25thing')
+    self.assertEqual(sitemap_gen.URL.Canonicalize(
+      'http://www.example.com/my%thing'),
+      'http://www.example.com/my%25thing')
+
+    # Test IDNA encoding
+    # The generator can only do the "right thing" on Python 2.3 or higher
+    warn = sitemap_gen.output.num_warns
+    if sys.hexversion >= 0x02030000:
+      self.assertEqual(sitemap_gen.URL.Canonicalize(
+        'http://www.' + unichr(252) + 'mlat.com/' + unichr(252) + 'mlat.txt'),
+        'http://www.xn--mlat-zra.com/%FCmlat.txt')
+      self.assertEqual(sitemap_gen.output.num_warns, warn)
+    else:
+      self.assertEqual(sitemap_gen.URL.Canonicalize(
+        'http://www.' + unichr(252) + 'mlat.com/' + unichr(252) + 'mlat.txt'),
+        'http://www.%FCmlat.com/%FCmlat.txt')
+      self.assertEqual(sitemap_gen.output.num_warns, warn + 2)
+
+    # All valid data
+    warn = sitemap_gen.output.num_warns
+    url1 = sitemap_gen.URL()
+    url1.TrySetAttribute('loc', BASE_R + 'bar.html')
+    url1.TrySetAttribute('lastmod', '1987-06-05T07:29:03Z')
+    url1.TrySetAttribute('changefreq', 'daily')
+    url1.TrySetAttribute('priority', '0.3')
+    self.assert_(url1.Validate(BASE_C, True))
+    self.assertEqual(sitemap_gen.output.num_warns, warn)
+
+    # Valid ref, all else invalid
+    warn = sitemap_gen.output.num_warns
+    url2 = sitemap_gen.URL()
+    url2.TrySetAttribute('loc', BASE_C + 'bar.html')
+    url2.TrySetAttribute('lastmod', 'June 1, 2005')
+    url2.TrySetAttribute('changefreq', 'every second')
+    url2.TrySetAttribute('priority', 'infinite')
+    url2.TrySetAttribute('badattr', 'Nope!')
+    self.assert_(url2.Validate(BASE_C, True))
+    self.assertEqual(sitemap_gen.output.num_warns, warn + 4)
+
+    # Two URLs with same ref should compare equal
+    self.assertEqual(url1, url2)
+
+    # A ref not based
+    warn = sitemap_gen.output.num_warns
+    url3 = sitemap_gen.URL()
+    url3.TrySetAttribute('loc', 'http://www.example.com/bar/foo.html')
+    self.assert_(not url3.Validate(BASE_C, True))
+    self.assertEqual(sitemap_gen.output.num_warns, warn + 1)
+
+    # A fragmentary URL
+    warn = sitemap_gen.output.num_warns
+    url4 = sitemap_gen.URL()
+    url4.TrySetAttribute('loc', '/foo.html')
+    self.assert_(not url4.Validate(BASE_C, False))
+    self.assertEqual(sitemap_gen.output.num_warns, warn + 1)
+    url4.TrySetAttribute('loc', '/xyzzy/foo.html')
+    self.assert_(url4.Validate('http://www.example.com/', True))
+    self.assertEqual(url4.loc, 'http://www.example.com/xyzzy/foo.html')
+    self.assertEqual(sitemap_gen.output.num_warns, warn + 1)
+
+    # Test a whole sequence of good and bad timestamp values
+    timestamps_good = [
+      '2001',
+      '2001-01',
+      '2001-01-02',
+      '2001-01-03T01:02Z',
+      '2001-01-03T01:02:03Z',
+      '2001-01-03T01:02:03.0123Z',
+      '2001-01-03T01:02+00:00',
+      '2001-01-03T01:02:03-99:99',
+      '2001-01-03T01:02:03.0123+88:88',
+      ]
+    timestamps_bad = [
+      '2001:01:03T01:02Z',
+      '2001-01-03T01:02:03.Z',
+      'a2001-01-06T01:02:05-99:99',
+      '2001-01-06T01:02:05-99:99Z',
+      '2001-1-6T01:02:05-99:99',
+      'xyzzy',
+      '2001-01-03T01:02:03.1.2Z',
+      ]
+    warn = sitemap_gen.output.num_warns
+    url3.TrySetAttribute('loc', BASE_C + 'foo.html')
+    for ts in timestamps_good:
+      url3.TrySetAttribute('lastmod', ts)
+      self.assert_(url3.Validate(BASE_C, True))
+    self.assertEqual(sitemap_gen.output.num_warns, warn)
+    for ts in timestamps_bad:
+      url3.TrySetAttribute('lastmod', ts)
+      self.assert_(url3.Validate(BASE_C, True))
+    self.assertEqual(sitemap_gen.output.num_warns, warn + len(timestamps_bad))
+  #end def testURL
+
+  def testFilter(self):
+    """ Test the filtering object """
+    url1 = sitemap_gen.URL()
+    url2 = sitemap_gen.URL()
+    url1.TrySetAttribute('loc', 'http://www.example.com/foo/bar.html')
+    url2.TrySetAttribute('loc', 'http://www.example.com/bar/foo.html')
+    url1.Validate('http://www.example.com', True)
+    url2.Validate('http://www.example.com', True)
+
+    # Arguments
+    error = sitemap_gen.output.num_errors
+    args_bad = [
+      {},
+      {'pattern' : '*', 'type' : 'unknown'},
+      {'pattern' : '*', 'type' : 'wildcard', 'action' : 'look pretty'},
+      {'pattern' : '*', 'type' : 'regexp'},
+      ]
+    error = sitemap_gen.output.num_errors
+    for args in args_bad:
+      sitemap_gen.Filter(args)
+    self.assertEqual(sitemap_gen.output.num_errors, error + len(args_bad))
+
+    # Wildcard
+    filt_w = sitemap_gen.Filter({'pattern' : '*/foo/*', 'type' : 'wildcard' })
+    self.assertEqual(filt_w.Apply(url1), False)
+    self.assertEqual(filt_w.Apply(url2), None)
+
+    # Regexp
+    filt_r = sitemap_gen.Filter({'pattern' : '/bar/[^/]+$', 'type' : 'REGEXP',
+                                 'action' : 'PASS'})
+    self.assertEqual(filt_r.Apply(url1), None)
+    self.assertEqual(filt_r.Apply(url2), True)
+  #end def testFilter
+
+  def Count(self, url, allow_fragment):
+    if url.Validate('http://www.example.com/', allow_fragment):
+      self.valid_urls = self.valid_urls + 1
+    else:
+      self.invalid_urls = self.invalid_urls + 1
+  #end def Count
+  valid_urls = 0
+  invalid_urls = 0
+
+  def testInputURL(self):
+    """ Test one of the Input mechanisms: InputURL """
+
+    # Feed a couple URLs. Make sure we get an error on extra attributes.
+    self.valid_urls = 0
+    self.invalid_urls = 0
+    error = sitemap_gen.output.num_errors
+    warn = sitemap_gen.output.num_warns
+    generator1 = sitemap_gen.InputURL({'href' : 'http://www.example.com/1',
+                                       'priority' : '0.3',
+                                       'lastmod' : '2004-11-14T01:00-07:00',
+                                       'changefreq' : 'hourly',
+                                       'unknownInURL' : 'attribute'})
+    generator2 = sitemap_gen.InputURL({'href' : 'http://www.example.com/2',
+                                       'priority' : '0.3',
+                                       'lastmod' : '2004-11-14T01:00-07:00',
+                                       'changefreq' : 'hourly'})
+    generator1.ProduceURLs(self.Count)
+    generator2.ProduceURLs(self.Count)
+    self.assertEqual(self.valid_urls, 1)
+    self.assertEqual(self.invalid_urls, 0)
+    self.assertEqual(sitemap_gen.output.num_errors, error + 1)
+    self.assertEqual(sitemap_gen.output.num_warns, warn)
+  #end def testInputURL
+
+  def testInputURLList(self):
+    """ Test one of the Input mechanisms: InputURLList """
+    path = tempfile.mktemp()
+    file = open(path, 'w')
+
+    try:
+      # Create a temp file we can read
+      testText = """
+http://www.example.com/foo/bar unknownInURLList=attribute
+http://www.example.com/foo/xxx.pdf lastmod=2003-12-31T14:05:06+00:00
+http://www.example.com/foo/yyy?x=12&y=23 changefreq=weekly priority=0.3
+      """
+      file.write(testText)
+      file.close()
+
+      # Feed in the data. Make sure we get a warning on the bad attribute.
+      self.valid_urls = 0
+      self.invalid_urls = 0
+      warn = sitemap_gen.output.num_warns
+      generator = sitemap_gen.InputURLList({'path' : path})
+      generator.ProduceURLs(self.Count)
+      self.assertEqual(self.valid_urls, 3)
+      self.assertEqual(self.invalid_urls, 0)
+      self.assertEqual(sitemap_gen.output.num_warns, warn + 1)
+
+    finally:
+      os.unlink(path)
+  #end def testInputURLList
+
+  def testInputDirectory(self):
+    """Test one of the Input mechanisms: InputDirectory.
+    I've seen a subtle path-bug appear when going into sub-sub-directories
+    that didn't under just sub-directories. So we go to the trouble to
+    make a whole little directory tree to read.
+    """
+    counter = URLCounter('http://www.example.com/', True, (
+      'http://www.example.com/',
+      'http://www.example.com/one.html',
+      'http://www.example.com/two.html',
+      'http://www.example.com/xyzzy/',
+      'http://www.example.com/xyzzy/thr.html',
+      'http://www.example.com/xyzzy/zyxxy/',
+      'http://www.example.com/xyzzy/zyxxy/fiv.html',
+      ))
+    path = tempfile.mktemp()
+    subpath = os.path.join(path, 'xyzzy')
+    subsubpath = os.path.join(subpath, 'zyxxy')
+
+    try:
+      # Create some dummy empty files
+      os.mkdir(path)
+      os.mkdir(subpath)
+      os.mkdir(subsubpath)
+      path_one = os.path.join(path, 'one.html')
+      path_two = os.path.join(path, 'two.html')
+      path_thr = os.path.join(subpath, 'thr.html')
+      path_for = os.path.join(subpath, 'default.html')
+      path_fiv = os.path.join(subsubpath, 'fiv.html')
+      open(path_one, 'w').close()
+      open(path_two, 'w').close()
+      open(path_thr, 'w').close()
+      open(path_for, 'w').close()
+      open(path_fiv, 'w').close()
+
+      # Feed in the data. There should be no warnings.
+      warn = sitemap_gen.output.num_warns
+      generator = sitemap_gen.InputDirectory({'path' : path,
+        'url' : 'http://www.example.com/', 'default_file' : 'default.html'},
+        'http://www.example.com/')
+      generator.ProduceURLs(counter.Count)
+      self.assertEqual(counter.Valid(), 7)
+      self.assertEqual(counter.Invalid(), 0)
+      self.assertEqual(sitemap_gen.output.num_warns, warn)
+
+    finally:
+      os.unlink(path_one)
+      os.unlink(path_two)
+      os.unlink(path_thr)
+      os.unlink(path_for)
+      os.unlink(path_fiv)
+      os.rmdir(subsubpath)
+      os.rmdir(subpath)
+      os.rmdir(path)
+  #end def testInputDirectory
+
+  def testInputAccessLogCLF(self):
+    """ Test one of the Input mechanisms: InputAccessLog (Common logfile) """
+    path = tempfile.mktemp()
+    file = open(path, 'w')
+
+    try:
+      # Create a temp file we can read
+      testText = '''
+msnbot.msn.com - - [15/May/2005:07:46:50 -0700] "GET /~guest/main/ HTTP/1.0" 200 5670
+221.216.237.71 - - [15/May/2005:07:59:25 -0700] "GET /~guest/bookmark/ HTTP/1.1" 200 39195
+221.216.237.71 - - [15/May/2005:07:59:27 -0700] "GET /favicon.ico HTTP/1.1" 404 217
+c-67-161-121-105.hsd1.wa.comcast.net - - [15/May/2005:11:17:23 -0700] "GET /picts/top.jpg HTTP/1.1" 200 10044
+cpe-65-24-155-46.columbus.res.rr.com - - [16/May/2005:22:53:07 -0700] "HEAD http://www.example.com/~guest HTTP/1.1" 200 0
+      '''
+      file.write(testText)
+      file.close()
+
+      # Feed in the data
+      self.valid_urls = 0
+      self.invalid_urls = 0
+      warn = sitemap_gen.output.num_warns
+      generator = sitemap_gen.InputAccessLog({'path' : path})
+      generator.ProduceURLs(self.Count)
+      self.assertEqual(self.valid_urls, 4)
+      self.assertEqual(self.invalid_urls, 0)
+      self.assertEqual(sitemap_gen.output.num_warns, warn)
+
+    finally:
+      os.unlink(path)
+  #end def testInputAccessLogCLF
+
+  def testInputAccessLogELF(self):
+    """ Test one of the Input mechanisms: InputAccessLog (Extended logfile) """
+    path = tempfile.mktemp()
+    file = open(path, 'w')
+
+    try:
+      # Create a temp file we can read
+      testText = '''
+#Software: Microsoft Internet Information Services 6.0
+#Version: 1.0
+#Date: 2004-03-22 09:20:36
+#Fields: date time s-ip cs-method cs-uri-stem cs-uri-query s-port cs-username c-ip cs(User-Agent) sc-status sc-substatus sc-win32-status
+2004-03-22 09:20:36 192.168.0.58 GET /Default.htm - 80 - 4.5.11.3 Mozilla/4.0+(compatible;+MSIE+5.5;+Windows+98) 200 0 64
+2004-03-22 09:22:58 192.168.0.58 GET /Default.htm - 80 - 24.87.160.82 Mozilla/4.0+(compatible;+MSIE+5.5;+Windows+98) 200 0 64
+      '''
+      file.write(testText)
+      file.close()
+
+      # Feed in the data
+      self.valid_urls = 0
+      self.invalid_urls = 0
+      warn = sitemap_gen.output.num_warns
+      generator = sitemap_gen.InputAccessLog({'path' : path})
+      generator.ProduceURLs(self.Count)
+      self.assertEqual(self.valid_urls, 2)
+      self.assertEqual(self.invalid_urls, 0)
+      self.assertEqual(sitemap_gen.output.num_warns, warn)
+
+    finally:
+      os.unlink(path)
+  #end def testInputAccessLogELF
+
+  def testInputSitemap(self):
+    """ Test one of the Input mechanisms: InputSitemap """
+    path1 = tempfile.mktemp('.xml')
+    path2 = tempfile.mktemp('.xml')
+    path3 = tempfile.mktemp('.xml')
+    path4 = tempfile.mktemp('.xml')
+    file1 = None
+    file2 = None
+    file3 = None
+    file4 = None
+
+    index = '''<?xml version="1.0" encoding="UTF-8"?>
+<sitemapindex
+  xmlns="http://www.google.com/schemas/sitemap/0.84"
+  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://www.google.com/schemas/sitemap/0.84
+                      http://www.google.com/schemas/sitemap/0.84/siteindex.xsd">
+  <sitemap>
+    <loc>http://www.example.com/path/to/%(PATH2)s</loc>
+    <lastmod>2005-07-15T17:41:22Z</lastmod>
+  </sitemap>
+  <sitemap>
+    <loc>http://www.example.com/path/to/%(PATH3)s</loc>
+    <lastmod>2005-07-15T17:41:22Z</lastmod>
+  </sitemap>
+</sitemapindex>
+'''
+    content1 = '''<?xml version="1.0" encoding="UTF-8"?>
+<urlset
+  xmlns="http://www.google.com/schemas/sitemap/0.84"
+  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://www.google.com/schemas/sitemap/0.84
+                      http://www.google.com/schemas/sitemap/0.84/sitemap.xsd">
+  <url>
+    <loc>http://www.example.com/another/path/to/samplefile1.html</loc>
+    <lastmod>2005-07-13T00:00:12Z</lastmod>
+    <priority>0.5000</priority>
+  </url>
+  <url>
+    <loc>http://www.example.com/another/path/to/samplefile2.html</loc>
+    <lastmod>2004-11-16T20:22:06Z</lastmod>
+    <priority>0.5000</priority>
+  </url>
+</urlset>
+'''
+    content2 = '''<?xml version="1.0" encoding="UTF-8"?>
+<urlset
+  xmlns="http://www.google.com/schemas/sitemap/0.84"
+  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://www.google.com/schemas/sitemap/0.84
+                      http://www.google.com/schemas/sitemap/0.84/sitemap.xsd">
+  <url badSitemapAttr="Hello, World!">
+    <loc>http://www.example.com/another/path/to/samplefile3.html</loc>
+    <lastmod>2005-07-13T00:00:12Z</lastmod>
+    <priority>0.5000</priority>
+  </url>
+  <url>
+    <loc>http://www.example.com/another/path/to/samplefile4.html</loc>
+    <lastmod>2004-11-16T20:22:06Z</lastmod>
+    <priority>0.5000</priority>
+  </url>
+</urlset>
+'''
+    # This index is invalid because it points to another index file.
+    badindex = '''<?xml version="1.0" encoding="UTF-8"?>
+<sitemapindex
+  xmlns="http://www.google.com/schemas/sitemap/0.84"
+  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://www.google.com/schemas/sitemap/0.84
+                      http://www.google.com/schemas/sitemap/0.84/siteindex.xsd">
+  <sitemap>
+    <loc>http://www.example.com/path/to/%(PATH2)s</loc>
+    <lastmod>2005-07-15T17:41:22Z</lastmod>
+  </sitemap>
+  <sitemap>
+    <loc>http://www.example.com/path/to/%(PATH1)s</loc>
+    <lastmod>2005-07-15T17:41:22Z</lastmod>
+  </sitemap>
+</sitemapindex>
+'''
+
+    # Make a nice complicated set of two index files and two sitemaps.
+    try:
+      file1 = open(path1, 'wt')
+      file2 = open(path2, 'wt')
+      file3 = open(path3, 'wt')
+      file4 = open(path4, 'wt')
+      file1.write(index % {
+        'PATH1' : os.path.basename(path1),
+        'PATH2' : os.path.basename(path2),
+        'PATH3' : os.path.basename(path3)})
+      file2.write(content1)
+      file3.write(content2)
+      file4.write(badindex % {
+        'PATH1' : os.path.basename(path1),
+        'PATH2' : os.path.basename(path2),
+        'PATH3' : os.path.basename(path3)})
+      file1.close()
+      file1 = None
+      file2.close()
+      file2 = None
+      file3.close()
+      file3 = None
+      file4.close()
+      file4 = None
+
+      # Feed in the good data. Make sure we get warned on the bad attribute.
+      self.valid_urls = 0
+      self.invalid_urls = 0
+      warn = sitemap_gen.output.num_warns
+      generator = sitemap_gen.InputSitemap({'path' : path1})
+      generator.ProduceURLs(self.Count)
+      self.assertEqual(self.valid_urls, 4)
+      self.assertEqual(self.invalid_urls, 0)
+      self.assertEqual(sitemap_gen.output.num_warns, warn + 1)
+
+      # Feed in the bad data. Should error once on the bad index and once
+      # because it aborts processing the XML.
+      self.valid_urls = 0
+      self.invalid_urls = 0
+      errors = sitemap_gen.output.num_errors
+      generator = sitemap_gen.InputSitemap({'path' : path4})
+      generator.ProduceURLs(self.Count)
+      self.assertEqual(self.valid_urls, 2)
+      self.assertEqual(self.invalid_urls, 0)
+      self.assertEqual(sitemap_gen.output.num_errors, errors + 2)
+
+    finally:
+      if file1 is not None:
+        file1.close()
+      if file2 is not None:
+        file2.close()
+      if file3 is not None:
+        file3.close()
+      if os.path.exists(path1):
+        os.unlink(path1)
+      if os.path.exists(path2):
+        os.unlink(path2)
+      if os.path.exists(path3):
+        os.unlink(path3)
+  #end def testInputSitemap
+
+  def testFilePathGenerator(self):
+    """ Test our iteration of filenames """
+    gen1 = sitemap_gen.FilePathGenerator()
+    gen2 = sitemap_gen.FilePathGenerator()
+    gen3 = sitemap_gen.FilePathGenerator()
+    self.assert_(gen1.Preload('/tmp/bar/foo.xml'))
+    self.assert_(gen2.Preload('foo.xml.gz'))
+    self.assert_(gen3.Preload('/foo.gz'))
+    self.assert_(not gen1.is_gzip)
+    self.assert_(gen2.is_gzip)
+    self.assert_(gen3.is_gzip)
+    self.assertEqual(gen1.GeneratePath(0),
+                     os.path.normpath('/tmp/bar/foo.xml'))
+    self.assertEqual(gen2.GeneratePath(1), 'foo1.xml.gz')
+    self.assertEqual(gen1.GeneratePath('_index.xml'),
+                     os.path.normpath('/tmp/bar/foo_index.xml'))
+    self.assertEqual(gen1.GenerateURL('_index.xml', 'http://www.example.com/'),
+                     'http://www.example.com/foo_index.xml')
+    self.assertEqual(gen1.GenerateURL(2, 'http://www.example.com/'),
+                     'http://www.example.com/foo2.xml')
+    self.assertEqual(gen2.GenerateWildURL('http://www.example.com/'),
+                     'http://www.example.com/foo*.xml.gz')
+  #end def testFilePathGenerator
+
+  def testSitemap(self):
+    """Test a basic config of the overall sitemap class."""
+    path1 = tempfile.mktemp()
+    path2 = tempfile.mktemp(".xml.gz")
+    file = open(path1, 'w')
+
+    try:
+      # Create a temp file we can read
+      testText = '''<?xml version="1.0" encoding="UTF-8"?>
+<site
+  base_url="http://www.example.com/"
+  store_into="%(OUTPUTFILENAME)s"
+  default_encoding="UTF-8"
+  verbose="3"
+>
+  <url href="http://www.example.com/.htaccess" />
+  <url href="http://www.example.com/foo/bar.html" />
+  <url href="http://www.example.com/foo/bar.gif" />
+  <url href="http://www.example.com/foo/bar.html" />
+  <url href="http://www.example.com/percent%%%%percent.html" />
+  <url href="http://www.example.com/ümlat.html" />
+  <filter action="drop" type="regexp" pattern="/\.[^/]*$" />
+</site>
+'''
+      file.write(testText % {'OUTPUTFILENAME' : path2})
+      file.close()
+
+      # Bring up the engine
+      warn = sitemap_gen.output.num_warns
+      error = sitemap_gen.output.num_errors
+      sitemap = sitemap_gen.CreateSitemapFromFile(path1, True)
+      self.assert_(sitemap)
+      sitemap.Generate()
+      self.assertEqual(sitemap_gen.output.num_warns, warn)
+      self.assertEqual(sitemap_gen.output.num_errors, error)
+
+      # Verify we got readable XML out of it
+      file = gzip.open(path2, mode='rb')
+      result = file.read()
+      file.close()
+      dom = xml.dom.minidom.parseString(result)
+      self.assertEqual(len(dom.getElementsByTagName('url')), 4)
+      self.assert_(result.find('http://www.example.com/foo/bar.html') > 0)
+      self.assert_(result.find('http://www.example.com/foo/bar.gif') > 0)
+      self.assert_(result.find('%25%25') > 0)
+      self.assert_(result.find('%C3%BC') > 0)
+    finally:
+      if os.path.exists(path2):
+        os.unlink(path2)
+      os.unlink(path1)
+  #end def testSitemap
+
+#end class TestSiteMap
+
+#
+# __main__
+#
+
+if __name__ == '__main__':
+  unittest.main()
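
For reference, the suite can also be driven without the unittest.main() hook at the bottom of the file. The runner below is a minimal sketch, not part of sitemap_gen-1.4; it assumes test_sitemap_gen.py and sitemap_gen.py sit in the same directory and that a Python 2.2+ interpreter is used, since the tests import sitemap_gen directly.

    # run_sitemap_tests.py -- hypothetical helper script, not shipped with sitemap_gen-1.4
    import unittest
    import test_sitemap_gen  # the module added by this commit; it imports sitemap_gen itself

    # Build a suite from the TestSiteMap case and run it with verbose output.
    suite = unittest.TestLoader().loadTestsFromTestCase(test_sitemap_gen.TestSiteMap)
    unittest.TextTestRunner(verbosity=2).run(suite)

Running the file directly (python test_sitemap_gen.py) has the same effect through the unittest.main() call in the __main__ block above.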