author | Giuseppe D'Angelo <giuseppe.dangelo@kdab.com> | 2018-05-06 20:30:33 +0200
committer | Tero Kojo <tero.kojo@qt.io> | 2018-05-07 07:40:52 +0000
commit | 58f1f8a6411926488342d2f9872811520d8eb580 (patch)
tree | b8b3d93568551ddc4f489fa891e0be1c195533c9
parent | c7161128b2c74c92a631a4caea518bb41759f516 (diff)
Upgrade to rawdog 2.22
Change-Id: Iccd32db1068ca65ee1e6de88ad137fa3950efde5
Reviewed-by: Tero Kojo <tero.kojo@qt.io>
-rwxr-xr-x | rawdog | 7
-rw-r--r-- | rawdoglib/__init__.py | 6
-rw-r--r-- | rawdoglib/feedfinder.py | 366
-rw-r--r-- | rawdoglib/feedscanner.py | 137
-rw-r--r-- | rawdoglib/persister.py | 221
-rw-r--r-- | rawdoglib/plugins.py | 14
-rw-r--r-- | rawdoglib/rawdog.py | 1176

7 files changed, 1033 insertions(+), 894 deletions(-)
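The headline change is the removal of the old feedfinder module (and its Syndic8 fallback) in favour of the much smaller feedscanner, which rawdog's feed-adding code now calls. A hypothetical usage sketch of the new module's one public entry point, matching how add_feed() in rawdog.py invokes it later in this diff:

```python
# Hypothetical usage, assuming rawdog 2.22's module layout is importable.
# feedscanner.feeds() fetches the page and returns candidate feed URLs,
# best match first (or the URL itself if it is already a feed).
import rawdoglib.feedscanner

candidates = rawdoglib.feedscanner.feeds("http://example.org/")
if candidates:
    print "Best feed:", candidates[0]
else:
    print "No feeds found"
```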
@@ -1,6 +1,6 @@ #!/usr/bin/env python # rawdog: RSS aggregator without delusions of grandeur. -# Copyright 2003, 2004, 2005, 2006 Adam Sampson <ats@offog.org> +# Copyright 2003, 2004, 2005, 2006, 2016 Adam Sampson <ats@offog.org> # # rawdog is free software; you can redistribute and/or modify it # under the terms of that license as published by the Free Software @@ -18,7 +18,9 @@ # MA 02110-1301, USA, or see http://www.gnu.org/. from rawdoglib.rawdog import main -import sys, os + +import os +import sys def launch(): sys.exit(main(sys.argv[1:])) @@ -29,4 +31,3 @@ if __name__ == "__main__": profile.run("launch()") else: launch() - diff --git a/rawdoglib/__init__.py b/rawdoglib/__init__.py index c6744f7..382e2c5 100644 --- a/rawdoglib/__init__.py +++ b/rawdoglib/__init__.py @@ -1 +1,5 @@ -__all__ = ['feedparser', 'feedfinder', 'timeoutsocket', 'rawdog', 'persister', 'upgrade_1_2'] +__all__ = [ + 'feedscanner', + 'persister', + 'rawdog', + ] diff --git a/rawdoglib/feedfinder.py b/rawdoglib/feedfinder.py deleted file mode 100644 index b4fd28e..0000000 --- a/rawdoglib/feedfinder.py +++ /dev/null @@ -1,366 +0,0 @@ -"""feedfinder: Find the Web feed for a Web page -http://www.aaronsw.com/2002/feedfinder/ - -Usage: - feed(uri) - returns feed found for a URI - feeds(uri) - returns all feeds found for a URI - - >>> import feedfinder - >>> feedfinder.feed('scripting.com') - 'http://scripting.com/rss.xml' - >>> - >>> feedfinder.feeds('scripting.com') - ['http://delong.typepad.com/sdj/atom.xml', - 'http://delong.typepad.com/sdj/index.rdf', - 'http://delong.typepad.com/sdj/rss.xml'] - >>> - -Can also use from the command line. Feeds are returned one per line: - - $ python feedfinder.py diveintomark.org - http://diveintomark.org/xml/atom.xml - -How it works: - 0. At every step, feeds are minimally verified to make sure they are really feeds. - 1. If the URI points to a feed, it is simply returned; otherwise - the page is downloaded and the real fun begins. - 2. Feeds pointed to by LINK tags in the header of the page (autodiscovery) - 3. <A> links to feeds on the same server ending in ".rss", ".rdf", ".xml", or - ".atom" - 4. <A> links to feeds on the same server containing "rss", "rdf", "xml", or "atom" - 5. <A> links to feeds on external servers ending in ".rss", ".rdf", ".xml", or - ".atom" - 6. <A> links to feeds on external servers containing "rss", "rdf", "xml", or "atom" - 7. Try some guesses about common places for feeds (index.xml, atom.xml, etc.). - 8. 
As a last ditch effort, we search Syndic8 for feeds matching the URI -""" - -__version__ = "1.371" -__date__ = "2006-04-24" -__maintainer__ = "Aaron Swartz (me@aaronsw.com)" -__author__ = "Mark Pilgrim (http://diveintomark.org)" -__copyright__ = "Copyright 2002-4, Mark Pilgrim; 2006 Aaron Swartz" -__license__ = "Python" -__credits__ = """Abe Fettig for a patch to sort Syndic8 feeds by popularity -Also Jason Diamond, Brian Lalor for bug reporting and patches""" - -_debug = 0 - -import sgmllib, urllib, urlparse, re, sys, robotparser - -import threading -class TimeoutError(Exception): pass -def timelimit(timeout): - """borrowed from web.py""" - def _1(function): - def _2(*args, **kw): - class Dispatch(threading.Thread): - def __init__(self): - threading.Thread.__init__(self) - self.result = None - self.error = None - - self.setDaemon(True) - self.start() - - def run(self): - try: - self.result = function(*args, **kw) - except: - self.error = sys.exc_info() - - c = Dispatch() - c.join(timeout) - if c.isAlive(): - raise TimeoutError, 'took too long' - if c.error: - raise c.error[0], c.error[1] - return c.result - return _2 - return _1 - -# XML-RPC support allows feedfinder to query Syndic8 for possible matches. -# Python 2.3 now comes with this module by default, otherwise you can download it -try: - import xmlrpclib # http://www.pythonware.com/products/xmlrpc/ -except ImportError: - xmlrpclib = None - -if not dict: - def dict(aList): - rc = {} - for k, v in aList: - rc[k] = v - return rc - -def _debuglog(message): - if _debug: print message - -class URLGatekeeper: - """a class to track robots.txt rules across multiple servers""" - def __init__(self): - self.rpcache = {} # a dictionary of RobotFileParser objects, by domain - self.urlopener = urllib.FancyURLopener() - self.urlopener.version = "feedfinder/" + __version__ + " " + self.urlopener.version + " +http://www.aaronsw.com/2002/feedfinder/" - _debuglog(self.urlopener.version) - self.urlopener.addheaders = [('User-agent', self.urlopener.version)] - robotparser.URLopener.version = self.urlopener.version - robotparser.URLopener.addheaders = self.urlopener.addheaders - - def _getrp(self, url): - protocol, domain = urlparse.urlparse(url)[:2] - if self.rpcache.has_key(domain): - return self.rpcache[domain] - baseurl = '%s://%s' % (protocol, domain) - robotsurl = urlparse.urljoin(baseurl, 'robots.txt') - _debuglog('fetching %s' % robotsurl) - rp = robotparser.RobotFileParser(robotsurl) - try: - rp.read() - except: - pass - self.rpcache[domain] = rp - return rp - - def can_fetch(self, url): - rp = self._getrp(url) - allow = rp.can_fetch(self.urlopener.version, url) - _debuglog("gatekeeper of %s says %s" % (url, allow)) - return allow - - @timelimit(10) - def get(self, url, check=True): - if check and not self.can_fetch(url): return '' - try: - return self.urlopener.open(url).read() - except: - return '' - -_gatekeeper = URLGatekeeper() - -class BaseParser(sgmllib.SGMLParser): - def __init__(self, baseuri): - sgmllib.SGMLParser.__init__(self) - self.links = [] - self.baseuri = baseuri - - def normalize_attrs(self, attrs): - def cleanattr(v): - v = sgmllib.charref.sub(lambda m: unichr(int(m.groups()[0])), v) - v = v.strip() - v = v.replace('<', '<').replace('>', '>').replace(''', "'").replace('"', '"').replace('&', '&') - return v - attrs = [(k.lower(), cleanattr(v)) for k, v in attrs] - attrs = [(k, k in ('rel','type') and v.lower() or v) for k, v in attrs] - return attrs - - def do_base(self, attrs): - attrsD = dict(self.normalize_attrs(attrs)) - 
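The timelimit() decorator deleted above implements a network timeout by running the wrapped call in a daemon thread and joining with a deadline. A minimal self-contained sketch of the same pattern (not feedfinder's exact code):

```python
import sys
import threading

class TimeoutError(Exception):
    pass

def timelimit(seconds):
    """Run the decorated function in a daemon thread, giving up after
    `seconds`; re-raise any exception the worker hit."""
    def decorate(func):
        def wrapper(*args, **kw):
            outcome = {}
            def run():
                try:
                    outcome["value"] = func(*args, **kw)
                except Exception:
                    outcome["error"] = sys.exc_info()
            t = threading.Thread(target=run)
            t.daemon = True  # don't keep the interpreter alive
            t.start()
            t.join(seconds)
            if t.is_alive():
                raise TimeoutError("took too long")
            if "error" in outcome:
                raise outcome["error"][0], outcome["error"][1], outcome["error"][2]
            return outcome.get("value")
        return wrapper
    return decorate

@timelimit(5)
def quick():
    return 42

print quick()  # 42
```

Note the caveat inherent in this pattern: a timed-out call keeps running in the background; the daemon flag only stops it blocking interpreter exit.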
if not attrsD.has_key('href'): return - self.baseuri = attrsD['href'] - - def error(self, *a, **kw): pass # we're not picky - -class LinkParser(BaseParser): - FEED_TYPES = ('application/rss+xml', - 'text/xml', - 'application/atom+xml', - 'application/x.atom+xml', - 'application/x-atom+xml') - def do_link(self, attrs): - attrsD = dict(self.normalize_attrs(attrs)) - if not attrsD.has_key('rel'): return - rels = attrsD['rel'].split() - if 'alternate' not in rels: return - if attrsD.get('type') not in self.FEED_TYPES: return - if not attrsD.has_key('href'): return - self.links.append(urlparse.urljoin(self.baseuri, attrsD['href'])) - -class ALinkParser(BaseParser): - def start_a(self, attrs): - attrsD = dict(self.normalize_attrs(attrs)) - if not attrsD.has_key('href'): return - self.links.append(urlparse.urljoin(self.baseuri, attrsD['href'])) - -def makeFullURI(uri): - uri = uri.strip() - if uri.startswith('feed://'): - uri = 'http://' + uri.split('feed://', 1).pop() - for x in ['http', 'https']: - if uri.startswith('%s://' % x): - return uri - return 'http://%s' % uri - -def getLinks(data, baseuri): - p = LinkParser(baseuri) - p.feed(data) - return p.links - -def getALinks(data, baseuri): - p = ALinkParser(baseuri) - p.feed(data) - return p.links - -def getLocalLinks(links, baseuri): - baseuri = baseuri.lower() - urilen = len(baseuri) - return [l for l in links if l.lower().startswith(baseuri)] - -def isFeedLink(link): - return link[link.rfind('.'):].lower() in ('.rss', '.rdf', '.xml', '.atom') - -def isXMLRelatedLink(link): - link = link.lower() - return link.count('rss') + link.count('rdf') + link.count('xml') + link.count('atom') - -r_brokenRedirect = re.compile('<newLocation[^>]*>(.*?)</newLocation>', re.S) -def tryBrokenRedirect(data): - if '<newLocation' in data: - newuris = r_brokenRedirect.findall(data) - if newuris: return newuris[0].strip() - -def couldBeFeedData(data): - data = data.lower() - if data.count('<html'): return 0 - return data.count('<rss') + data.count('<rdf') + data.count('<feed') - -def isFeed(uri): - _debuglog('seeing if %s is a feed' % uri) - protocol = urlparse.urlparse(uri) - if protocol[0] not in ('http', 'https'): return 0 - data = _gatekeeper.get(uri) - return couldBeFeedData(data) - -def sortFeeds(feed1Info, feed2Info): - return cmp(feed2Info['headlines_rank'], feed1Info['headlines_rank']) - -def getFeedsFromSyndic8(uri): - feeds = [] - try: - server = xmlrpclib.Server('http://www.syndic8.com/xmlrpc.php') - feedids = server.syndic8.FindFeeds(uri) - infolist = server.syndic8.GetFeedInfo(feedids, ['headlines_rank','status','dataurl']) - infolist.sort(sortFeeds) - feeds = [f['dataurl'] for f in infolist if f['status']=='Syndicated'] - _debuglog('found %s feeds through Syndic8' % len(feeds)) - except: - pass - return feeds - -def feeds(uri, all=False, querySyndic8=False, _recurs=None): - if _recurs is None: _recurs = [uri] - fulluri = makeFullURI(uri) - try: - data = _gatekeeper.get(fulluri, check=False) - except: - return [] - # is this already a feed? 
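makeFullURI(), defined just above, is a small pure function worth seeing in isolation: it maps the feed:// pseudo-scheme to http:// and supplies a default scheme for bare hostnames. An equivalent sketch:

```python
def make_full_uri(uri):
    # feed://host/path is treated as an alias for http://
    uri = uri.strip()
    if uri.startswith("feed://"):
        uri = "http://" + uri[len("feed://"):]
    for scheme in ("http", "https"):
        if uri.startswith(scheme + "://"):
            return uri
    # Bare hostnames default to HTTP.
    return "http://" + uri

print make_full_uri("feed://example.org/rss")  # http://example.org/rss
print make_full_uri("scripting.com")           # http://scripting.com
```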
- if couldBeFeedData(data): - return [fulluri] - newuri = tryBrokenRedirect(data) - if newuri and newuri not in _recurs: - _recurs.append(newuri) - return feeds(newuri, all=all, querySyndic8=querySyndic8, _recurs=_recurs) - # nope, it's a page, try LINK tags first - _debuglog('looking for LINK tags') - try: - outfeeds = getLinks(data, fulluri) - except: - outfeeds = [] - _debuglog('found %s feeds through LINK tags' % len(outfeeds)) - outfeeds = filter(isFeed, outfeeds) - if all or not outfeeds: - # no LINK tags, look for regular <A> links that point to feeds - _debuglog('no LINK tags, looking at A tags') - try: - links = getALinks(data, fulluri) - except: - links = [] - locallinks = getLocalLinks(links, fulluri) - # look for obvious feed links on the same server - outfeeds.extend(filter(isFeed, filter(isFeedLink, locallinks))) - if all or not outfeeds: - # look harder for feed links on the same server - outfeeds.extend(filter(isFeed, filter(isXMLRelatedLink, locallinks))) - if all or not outfeeds: - # look for obvious feed links on another server - outfeeds.extend(filter(isFeed, filter(isFeedLink, links))) - if all or not outfeeds: - # look harder for feed links on another server - outfeeds.extend(filter(isFeed, filter(isXMLRelatedLink, links))) - if all or not outfeeds: - _debuglog('no A tags, guessing') - suffixes = [ # filenames used by popular software: - 'atom.xml', # blogger, TypePad - 'index.atom', # MT, apparently - 'index.rdf', # MT - 'rss.xml', # Dave Winer/Manila - 'index.xml', # MT - 'index.rss' # Slash - ] - outfeeds.extend(filter(isFeed, [urlparse.urljoin(fulluri, x) for x in suffixes])) - if (all or not outfeeds) and querySyndic8: - # still no luck, search Syndic8 for feeds (requires xmlrpclib) - _debuglog('still no luck, searching Syndic8') - outfeeds.extend(getFeedsFromSyndic8(uri)) - if hasattr(__builtins__, 'set') or __builtins__.has_key('set'): - outfeeds = list(set(outfeeds)) - return outfeeds - -getFeeds = feeds # backwards-compatibility - -def feed(uri): - #todo: give preference to certain feed formats - feedlist = feeds(uri) - if feedlist: - return feedlist[0] - else: - return None - -##### test harness ###### - -def test(): - uri = 'http://diveintomark.org/tests/client/autodiscovery/html4-001.html' - failed = [] - count = 0 - while 1: - data = _gatekeeper.get(uri) - if data.find('Atom autodiscovery test') == -1: break - sys.stdout.write('.') - sys.stdout.flush() - count += 1 - links = getLinks(data, uri) - if not links: - print '\n*** FAILED ***', uri, 'could not find link' - failed.append(uri) - elif len(links) > 1: - print '\n*** FAILED ***', uri, 'found too many links' - failed.append(uri) - else: - atomdata = urllib.urlopen(links[0]).read() - if atomdata.find('<link rel="alternate"') == -1: - print '\n*** FAILED ***', uri, 'retrieved something that is not a feed' - failed.append(uri) - else: - backlink = atomdata.split('href="').pop().split('"')[0] - if backlink != uri: - print '\n*** FAILED ***', uri, 'retrieved wrong feed' - failed.append(uri) - if data.find('<link rel="next" href="') == -1: break - uri = urlparse.urljoin(uri, data.split('<link rel="next" href="').pop().split('"')[0]) - print - print count, 'tests executed,', len(failed), 'failed' - -if __name__ == '__main__': - args = sys.argv[1:] - if args and args[0] == '--debug': - _debug = 1 - args.pop(0) - if args: - uri = args[0] - else: - uri = 'http://diveintomark.org/' - if uri == 'test': - test() - else: - print "\n".join(getFeeds(uri)) diff --git a/rawdoglib/feedscanner.py 
b/rawdoglib/feedscanner.py new file mode 100644 index 0000000..e0034b2 --- /dev/null +++ b/rawdoglib/feedscanner.py @@ -0,0 +1,137 @@ +"""Scan a URL's contents to find related feeds + +This is a compatible replacement for Aaron Swartz's feedfinder module, +using feedparser to check whether the URLs it returns are feeds. + +It finds links to feeds within the following elements: +- <link rel="alternate" ...> (standard feed discovery) +- <a ...>, if the href contains words that suggest it might be a feed + +It orders feeds using a quality heuristic: the first result is the most +likely to be a feed for the given URL. + +Required: Python 2.4 or later, feedparser +""" + +__license__ = """ +Copyright (c) 2008 Decklin Foster <decklin@red-bean.com> +Copyright (c) 2013, 2015 Adam Sampson <ats@offog.org> + +Permission to use, copy, modify, and/or distribute this software for +any purpose with or without fee is hereby granted, provided that +the above copyright notice and this permission notice appear in all +copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL +WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE +AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL +DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA +OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER +TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +PERFORMANCE OF THIS SOFTWARE. +""" + +import cStringIO +import feedparser +import gzip +import re +import urllib2 +import urlparse +import HTMLParser + +def is_feed(url): + """Return true if feedparser can understand the given URL as a feed.""" + + p = feedparser.parse(url) + version = p.get("version") + if version is None: + version = "" + return version != "" + +def fetch_url(url): + """Fetch the given URL and return the data from it as a Unicode string.""" + + request = urllib2.Request(url) + request.add_header("Accept-Encoding", "gzip") + + f = urllib2.urlopen(request) + headers = f.info() + data = f.read() + f.close() + + # We have to support gzip encoding because some servers will use it + # even if you explicitly refuse it in Accept-Encoding. + encodings = headers.get("Content-Encoding", "") + encodings = [s.strip() for s in encodings.split(",")] + if "gzip" in encodings: + f = gzip.GzipFile(fileobj=cStringIO.StringIO(data)) + data = f.read() + f.close() + + # Silently ignore encoding errors -- we don't need to go to the bother of + # detecting the encoding properly (like feedparser does). + data = data.decode("UTF-8", "ignore") + + return data + +class FeedFinder(HTMLParser.HTMLParser): + def __init__(self, base_uri): + HTMLParser.HTMLParser.__init__(self) + self.found = [] + self.count = 0 + self.base_uri = base_uri + + def add(self, score, href): + url = urlparse.urljoin(self.base_uri, href) + lower = url.lower() + + # Some sites provide feeds both for entries and comments; + # prefer the former. + if lower.find("comment") != -1: + score -= 50 + + # Prefer Atom, then RSS, then RDF (RSS 1). 
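The comment above describes feedscanner's ordering trick: each candidate is stored as a (-score, arrival_index, url) tuple, so a plain sorted() yields best-first order and ties keep their discovery order. A condensed sketch of the heuristic:

```python
found = []
counter = [0]  # mutable cell so add() can bump the arrival index

def add(score, url):
    lower = url.lower()
    if "comment" in lower:
        score -= 50   # prefer entry feeds over comment feeds
    if "atom" in lower:
        score += 10   # prefer Atom over RSS and RDF
    found.append((-score, counter[0], url))
    counter[0] += 1

add(200, "http://example.org/comments/rss")
add(200, "http://example.org/atom.xml")
print [url for (_, _, url) in sorted(found)]
# ['http://example.org/atom.xml', 'http://example.org/comments/rss']
```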
+ if lower.find("atom") != -1: + score += 10 + elif lower.find("rss2") != -1: + score -= 5 + elif lower.find("rss") != -1: + score -= 10 + elif lower.find("rdf") != -1: + score -= 15 + + self.found.append((-score, self.count, url)) + self.count += 1 + + def urls(self): + return [link[2] for link in sorted(self.found)] + + def handle_starttag(self, tag, attrs): + attrs = dict(attrs) + href = attrs.get('href') + if href is None: + return + if tag == 'link' and attrs.get('rel') == 'alternate' and \ + not attrs.get('type') == 'text/html': + self.add(200, href) + if tag == 'a' and re.search(r'\b(rss|atom|rdf|feeds?)\b', href, re.I): + self.add(100, href) + +def feeds(page_url): + """Search the given URL for possible feeds, returning a list of them.""" + + # If the URL is a feed, there's no need to scan it for links. + if is_feed(page_url): + return [page_url] + + data = fetch_url(page_url) + parser = FeedFinder(page_url) + try: + parser.feed(data) + except HTMLParser.HTMLParseError: + pass + found = parser.urls() + + # Return only feeds that feedparser can understand. + return [feed for feed in found if is_feed(feed)] diff --git a/rawdoglib/persister.py b/rawdoglib/persister.py index 6c06e2c..40169da 100644 --- a/rawdoglib/persister.py +++ b/rawdoglib/persister.py @@ -1,87 +1,192 @@ -# persister: safe class persistance wrapper -# Copyright 2003, 2004, 2005 Adam Sampson <ats@offog.org> +# persister: persist Python objects safely to pickle files +# Copyright 2003, 2004, 2005, 2013, 2014 Adam Sampson <ats@offog.org> # -# persister is free software; you can redistribute it and/or modify it -# under the terms of the GNU Lesser General Public License as -# published by the Free Software Foundation; either version 2.1 of the -# License, or (at your option) any later version. +# rawdog is free software; you can redistribute and/or modify it +# under the terms of that license as published by the Free Software +# Foundation; either version 2 of the License, or (at your option) +# any later version. # -# persister is distributed in the hope that it will be useful, but +# rawdog is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# Lesser General Public License for more details. +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. # -# You should have received a copy of the GNU Lesser General Public -# License along with persister; see the file COPYING.LGPL. If not, -# write to the Free Software Foundation, Inc., 51 Franklin Street, -# Fifth Floor, Boston, MA 02110-1301, USA, or see http://www.gnu.org/. +# You should have received a copy of the GNU General Public License +# along with rawdog; see the file COPYING. If not, write to the Free +# Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, +# MA 02110-1301, USA, or see http://www.gnu.org/. -import fcntl, os, errno import cPickle as pickle +import errno +import fcntl +import os +import sys class Persistable: - """Something which can be persisted. When a subclass of this wants to - indicate that it has been modified, it should call - self.modified().""" - def __init__(self): self._modified = False - def modified(self, state = True): self._modified = state - def is_modified(self): return self._modified + """An object which can be persisted.""" -class Persister: - """Persist another class to a file, safely. 
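Stepping back to feedscanner's fetch_url() above: it decompresses gzip bodies by hand because some servers send gzip regardless of Accept-Encoding. The core of that dance as a runnable round-trip (helper name is illustrative):

```python
import cStringIO
import gzip

def maybe_gunzip(data, content_encoding):
    encodings = [s.strip() for s in content_encoding.split(",")]
    if "gzip" in encodings:
        f = gzip.GzipFile(fileobj=cStringIO.StringIO(data))
        data = f.read()
        f.close()
    return data

# Round-trip: compress a fake feed body, then undo it.
buf = cStringIO.StringIO()
g = gzip.GzipFile(fileobj=buf, mode="wb")
g.write("<rss/>")
g.close()
print maybe_gunzip(buf.getvalue(), "gzip")  # <rss/>
```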
The class being persisted - must derive from Persistable (although this isn't enforced).""" + def __init__(self): + self._modified = False - def __init__(self, filename, klass, use_locking = True): - self.filename = filename + def modified(self, state=True): + """Mark the object as having been modified (or not).""" + self._modified = state + + def is_modified(self): + return self._modified + +class Persisted: + """Context manager for a persistent object. The object being persisted + must implement the Persistable interface.""" + + def __init__(self, klass, filename, persister): self.klass = klass - self.use_locking = use_locking - self.file = None + self.filename = filename + self.persister = persister + self.lock_file = None self.object = None + self.refcount = 0 + + def rename(self, new_filename): + """Rename the persisted file. This works whether the file is + currently open or not.""" + + self.persister._rename(self.filename, new_filename) + for ext in ("", ".lock"): + try: + os.rename(self.filename + ext, + new_filename + ext) + except OSError, e: + # If the file doesn't exist (yet), + # that's OK. + if e.errno != errno.ENOENT: + raise e + self.filename = new_filename + + def __enter__(self): + """As open().""" + return self.open() - def load(self, no_block = True): - """Load the persisted object from the file, or create a new one - if this isn't possible. Returns the loaded object.""" + def __exit__(self, type, value, tb): + """As close(), unless an exception occurred in which case do + nothing.""" + if tb is None: + self.close() - def get_lock(): - if not self.use_locking: - return True + def open(self, no_block=True): + """Return the persistent object, loading it from its file if it + isn't already open. You must call close() once you're finished + with the object. + + If no_block is True, then this will return None if loading the + object would otherwise block (i.e. if it's locked by another + process).""" + + if self.refcount > 0: + # Already loaded. + self.refcount += 1 + return self.object + + try: + self._open(no_block) + except KeyboardInterrupt: + sys.exit(1) + except: + print "An error occurred while reading state from " + os.path.abspath(self.filename) + "." + print "This usually means the file is corrupt, and removing it will fix the problem." + sys.exit(1) + + self.refcount = 1 + return self.object + + def _get_lock(self, no_block): + if not self.persister.use_locking: + return True + + self.lock_file = open(self.filename + ".lock", "w+") + try: mode = fcntl.LOCK_EX if no_block: mode |= fcntl.LOCK_NB - try: - fcntl.lockf(self.file.fileno(), mode) - except IOError, e: - if no_block and e.errno in (errno.EACCES, errno.EAGAIN): - return False - raise e - return True + fcntl.lockf(self.lock_file.fileno(), mode) + except IOError, e: + if no_block and e.errno in (errno.EACCES, errno.EAGAIN): + return False + raise e + return True + + def _open(self, no_block): + self.persister.log("Loading state file: ", self.filename) + + if not self._get_lock(no_block): + return None try: - self.file = open(self.filename, "r+") - if not get_lock(): - return None - self.object = pickle.load(self.file) - self.object.modified(False) + f = open(self.filename, "rb") except IOError: - self.file = open(self.filename, "w+") - if not get_lock(): - return None + # File can't be opened. + # Create a new object. 
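The rewritten persister locks a separate .lock file with fcntl before touching the pickle, optionally non-blocking so a second rawdog process can give up rather than queue. A minimal sketch of _get_lock()'s approach:

```python
import errno
import fcntl

def get_lock(state_filename, no_block=True):
    """Return an open lock-file handle, or None if another process
    holds the lock and no_block is set."""
    lock_file = open(state_filename + ".lock", "w+")
    mode = fcntl.LOCK_EX
    if no_block:
        mode |= fcntl.LOCK_NB
    try:
        fcntl.lockf(lock_file.fileno(), mode)
    except IOError, e:
        if no_block and e.errno in (errno.EACCES, errno.EAGAIN):
            return None
        raise
    return lock_file  # lock is held while this handle stays open

print get_lock("/tmp/example.state") is not None
```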
self.object = self.klass() self.object.modified() - return self.object + return + + self.object = pickle.load(f) + self.object.modified(False) + f.close() + + def close(self): + """Reduce the reference count of the persisted object, saving + it back to its file if necessary.""" + + self.refcount -= 1 + if self.refcount > 0: + # Still in use. + return - def save(self): - """Save the persisted object back to the file if necessary.""" if self.object.is_modified(): + self.persister.log("Saving state file: ", self.filename) newname = "%s.new-%d" % (self.filename, os.getpid()) newfile = open(newname, "w") - try: - pickle.dump(self.object, newfile, pickle.HIGHEST_PROTOCOL) - except AttributeError: - # Python 2.2 doesn't have the protocol - # argument. - pickle.dump(self.object, newfile, True) + pickle.dump(self.object, newfile, pickle.HIGHEST_PROTOCOL) newfile.close() os.rename(newname, self.filename) - self.file.close() + if self.lock_file is not None: + self.lock_file.close() + self.persister._remove(self.filename) + +class Persister: + """Manage the collection of persisted files.""" + + def __init__(self, config): + self.files = {} + self.log = config.log + self.use_locking = config.locking + + def get(self, klass, filename): + """Get a context manager for a persisted file. + If the file is already open, this will return + the existing context manager.""" + + if filename in self.files: + return self.files[filename] + + p = Persisted(klass, filename, self) + self.files[filename] = p + return p + + def _rename(self, old_filename, new_filename): + self.files[new_filename] = self.files[old_filename] + del self.files[old_filename] + + def _remove(self, filename): + del self.files[filename] + + def delete(self, filename): + """Delete a persisted file, along with its lock file, + if they exist.""" + for ext in ("", ".lock"): + try: + os.unlink(filename + ext) + except OSError: + pass diff --git a/rawdoglib/plugins.py b/rawdoglib/plugins.py index 9dcb2e7..447b269 100644 --- a/rawdoglib/plugins.py +++ b/rawdoglib/plugins.py @@ -1,5 +1,5 @@ # plugins: handle add-on modules for rawdog. -# Copyright 2004, 2005 Adam Sampson <ats@offog.org> +# Copyright 2004, 2005, 2013, 2016 Adam Sampson <ats@offog.org> # # rawdog is free software; you can redistribute and/or modify it # under the terms of that license as published by the Free Software @@ -16,12 +16,17 @@ # Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, # MA 02110-1301, USA, or see http://www.gnu.org/. -import os, imp +# The design of rawdog's plugin API was inspired by Stuart Langridge's +# Vellum weblog system: +# http://www.kryogenix.org/code/vellum/ + +import imp +import os class Box: """Utility class that holds a mutable value. Useful for passing immutable types by reference.""" - def __init__(self, value = None): + def __init__(self, value=None): self.value = value plugin_count = 0 @@ -49,7 +54,7 @@ def load_plugins(dir, config): fn = os.path.join(dir, file) config.log("Loading plugin ", fn) f = open(fn, "r") - mod = imp.load_module("plugin%d" % (plugin_count,), f, fn, desc) + imp.load_module("plugin%d" % (plugin_count,), f, fn, desc) plugin_count += 1 f.close() @@ -70,4 +75,3 @@ def call_hook(hookname, *args): if not func(*args): return True return False - diff --git a/rawdoglib/rawdog.py b/rawdoglib/rawdog.py index afb90e6..06139dd 100644 --- a/rawdoglib/rawdog.py +++ b/rawdoglib/rawdog.py @@ -1,5 +1,5 @@ # rawdog: RSS aggregator without delusions of grandeur. 
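Persisted.close() above saves with the classic write-then-rename pattern: pickle into a .new-<pid> file, then os.rename() it over the old state. The rename is atomic on POSIX within one filesystem, so a crash never leaves a half-written state file. The pattern in isolation:

```python
import cPickle as pickle
import os

def save_atomically(obj, filename):
    newname = "%s.new-%d" % (filename, os.getpid())
    f = open(newname, "w")
    pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
    f.close()
    os.rename(newname, filename)  # readers see old or new, never partial

save_atomically({"articles": {}}, "/tmp/example.state")
print os.path.exists("/tmp/example.state")  # True
```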
-# Copyright 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Adam Sampson <ats@offog.org> +# Copyright 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2012, 2013, 2014, 2015, 2016 Adam Sampson <ats@offog.org> # # rawdog is free software; you can redistribute and/or modify it # under the terms of that license as published by the Free Software @@ -16,26 +16,32 @@ # Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, # MA 02110-1301, USA, or see http://www.gnu.org/. -VERSION = "2.13" +VERSION = "2.22" +HTTP_AGENT = "rawdog/" + VERSION STATE_VERSION = 2 -import feedparser, plugins -from persister import Persistable, Persister -import os, time, getopt, sys, re, cgi, socket, urllib2, calendar -import string, locale -from StringIO import StringIO -import types - -try: - import threading - have_threading = 1 -except: - have_threading = 0 -try: - import hashlib -except: - hashlib = None - import sha +import rawdoglib.feedscanner +from rawdoglib.persister import Persistable, Persister +from rawdoglib.plugins import Box, call_hook, load_plugins + +from cStringIO import StringIO +import base64 +import calendar +import cgi +import feedparser +import getopt +import hashlib +import locale +import os +import re +import socket +import string +import sys +import threading +import time +import types +import urllib2 +import urlparse try: import tidylib @@ -47,26 +53,18 @@ try: except: mxtidy = None -try: - import feedfinder -except: - feedfinder = None +# Turn off content-cleaning, since we want to see an approximation to the +# original content for hashing. rawdog will sanitise HTML when writing. +feedparser.RESOLVE_RELATIVE_URIS = 0 +feedparser.SANITIZE_HTML = 0 -def new_sha1(s = ""): - """Return a new SHA1 hash object.""" - if hashlib is None: - return sha.new(s) - else: - return hashlib.sha1(s) +# Disable microformat support, because it tends to return poor-quality data +# (e.g. identifying inappropriate things as enclosures), and it relies on +# BeautifulSoup which is unable to parse many feeds. +feedparser.PARSE_MICROFORMATS = 0 -def set_socket_timeout(n): - """Set the system socket timeout.""" - if hasattr(socket, "setdefaulttimeout"): - socket.setdefaulttimeout(n) - else: - # Python 2.2 and earlier need to use an external module. - import timeoutsocket - timeoutsocket.setDefaultSocketTimeout(n) +# This is initialised in main(). 
+persister = None system_encoding = None def get_system_encoding(): @@ -76,12 +74,18 @@ def get_system_encoding(): def safe_ftime(format, t): """Format a time value into a string in the current locale (as time.strftime), but encode the result as ASCII HTML.""" - u = unicode(time.strftime(format, t), get_system_encoding()) + try: + u = unicode(time.strftime(format, t), get_system_encoding()) + except ValueError, e: + u = u"(bad time %s; %s)" % (repr(t), str(e)) return encode_references(u) def format_time(secs, config): """Format a time and date nicely.""" - t = time.localtime(secs) + try: + t = time.localtime(secs) + except ValueError, e: + return u"(bad time %s; %s)" % (repr(secs), str(e)) format = config["datetimeformat"] if format is None: format = config["timeformat"] + ", " + config["dayformat"] @@ -130,13 +134,17 @@ def sanitise_html(html, baseurl, inline, config): html = "<p>" + html if config["tidyhtml"]: - args = {"numeric_entities": 1, - "output_html": 1, - "output_xhtml": 0, - "output_xml": 0, - "wrap": 0} - plugins.call_hook("mxtidy_args", config, args, baseurl, inline) - plugins.call_hook("tidy_args", config, args, baseurl, inline) + args = { + "numeric_entities": 1, + "input_encoding": "ascii", + "output_encoding": "ascii", + "output_html": 1, + "output_xhtml": 0, + "output_xml": 0, + "wrap": 0, + } + call_hook("mxtidy_args", config, args, baseurl, inline) + call_hook("tidy_args", config, args, baseurl, inline) if tidylib is not None: # Disable PyTidyLib's somewhat unhelpful defaults. tidylib.BASE_OPTIONS = {} @@ -150,16 +158,18 @@ def sanitise_html(html, baseurl, inline, config): : output.rfind("</body>")].strip() html = html.decode("UTF-8") - box = plugins.Box(html) - plugins.call_hook("clean_html", config, box, baseurl, inline) + box = Box(html) + call_hook("clean_html", config, box, baseurl, inline) return box.value def select_detail(details): """Pick the preferred type of detail from a list of details. 
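safe_ftime() and format_time() above gain try/except guards because feeds can carry timestamps outside the platform's time_t range, which makes time.localtime()/strftime() raise ValueError. The defensive pattern on its own:

```python
import time

def format_time_safe(secs, fmt="%Y-%m-%d %H:%M"):
    try:
        return time.strftime(fmt, time.localtime(secs))
    except ValueError, e:
        # e.g. "timestamp out of range for platform time_t"
        return "(bad time %r; %s)" % (secs, e)

print format_time_safe(0)     # the epoch, formatted normally
print format_time_safe(1e18)  # placeholder text instead of a crash
```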
(If the argument isn't a list, treat it as a list of one.)""" - types = {"text/html": 30, - "application/xhtml+xml": 20, - "text/plain": 10} + TYPES = { + "text/html": 30, + "application/xhtml+xml": 20, + "text/plain": 10, + } if details is None: return None @@ -171,8 +181,8 @@ def select_detail(details): ctype = detail.get("type", None) if ctype is None: continue - if types.has_key(ctype): - score = types[ctype] + if TYPES.has_key(ctype): + score = TYPES[ctype] else: score = 0 if detail["value"] != "": @@ -184,7 +194,7 @@ def select_detail(details): else: return ds[-1][1] -def detail_to_html(details, inline, config, force_preformatted = False): +def detail_to_html(details, inline, config, force_preformatted=False): """Convert a detail hash or list of detail hashes as returned by feedparser into HTML.""" detail = select_detail(details) @@ -212,14 +222,14 @@ def author_to_html(entry, feedurl, config): url = None fallback = "author" if author_detail is not None: - if author_detail.has_key("url"): - url = author_detail["url"] + if author_detail.has_key("href"): + url = author_detail["href"] elif author_detail.has_key("email") and author_detail["email"] is not None: url = "mailto:" + author_detail["email"] if author_detail.has_key("email") and author_detail["email"] is not None: fallback = author_detail["email"] - elif author_detail.has_key("url") and author_detail["url"] is not None: - fallback = author_detail["url"] + elif author_detail.has_key("href") and author_detail["href"] is not None: + fallback = author_detail["href"] if name == "": name = fallback @@ -242,8 +252,8 @@ def fill_template(template, bits): including sections bracketed by __if_x__ .. [__else__ ..] __endif__ if bits["x"] is not "". If not bits.has_key("x"), __x__ expands to "".""" - result = plugins.Box() - plugins.call_hook("fill_template", template, bits, result) + result = Box() + call_hook("fill_template", template, bits, result) if result.value is not None: return result.value @@ -279,12 +289,24 @@ def fill_template(template, bits): file_cache = {} def load_file(name): - """Read the contents of a file, caching the result so we don't have to - read the file multiple times.""" + """Read the contents of a template file, caching the result so we don't + have to read the file multiple times. The file is assumed to be in the + system encoding; the result will be an ASCII string.""" if not file_cache.has_key(name): - f = open(name) - file_cache[name] = f.read() - f.close() + try: + f = open(name) + data = f.read() + f.close() + except IOError: + raise ConfigError("Can't read template file: " + name) + + try: + data = data.decode(get_system_encoding()) + except UnicodeDecodeError, e: + raise ConfigError("Character encoding problem in template file: " + name + ": " + str(e)) + + data = encode_references(data) + file_cache[name] = data.encode(get_system_encoding()) return file_cache[name] def write_ascii(f, s, config): @@ -299,7 +321,7 @@ def write_ascii(f, s, config): def short_hash(s): """Return a human-manipulatable 'short hash' of a string.""" - return new_sha1(s).hexdigest()[-8:] + return hashlib.sha1(s).hexdigest()[-8:] def ensure_unicode(value, encoding): """Convert a structure returned by feedparser into an equivalent where @@ -327,6 +349,81 @@ def ensure_unicode(value, encoding): else: return value +timeout_re = re.compile(r'timed? 
?out', re.I) +def is_timeout_exception(exc): + """Return True if the given exception object suggests that a timeout + occurred, else return False.""" + + # Since urlopen throws away the original exception object, + # we have to look at the stringified form to tell if it was a timeout. + # (We're in reasonable company here, since test_ssl.py in the Python + # distribution does the same thing!) + # + # The message we're looking for is something like: + # Stock Python 2.7.7 and 2.7.8: + # <urlopen error _ssl.c:495: The handshake operation timed out> + # Debian python 2.7.3-4+deb7u1: + # <urlopen error _ssl.c:489: The handshake operation timed out> + # Debian python 2.7.8-1: + # <urlopen error ('_ssl.c:563: The handshake operation timed out',)> + return timeout_re.search(str(exc)) is not None + +class BasicAuthProcessor(urllib2.BaseHandler): + """urllib2 handler that does HTTP basic authentication + or proxy authentication with a fixed username and password. + (Unlike the classes to do this in urllib2, this doesn't wait + for a 401/407 response first.)""" + + def __init__(self, user, password, proxy=False): + self.auth = base64.b64encode(user + ":" + password) + if proxy: + self.header = "Proxy-Authorization" + else: + self.header = "Authorization" + + def http_request(self, req): + req.add_header(self.header, "Basic " + self.auth) + return req + + https_request = http_request + +class DisableIMProcessor(urllib2.BaseHandler): + """urllib2 handler that disables RFC 3229 for a request.""" + + def http_request(self, req): + # Request doesn't provide a method for removing headers -- + # so overwrite the header instead. + req.add_header("A-IM", "identity") + return req + + https_request = http_request + +class ResponseLogProcessor(urllib2.BaseHandler): + """urllib2 handler that maintains a log of HTTP responses.""" + + # Run after anything that's mangling headers (usually 500 or less), but + # before HTTPErrorProcessor (1000). 
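BasicAuthProcessor above sends credentials preemptively, unlike urllib2's stock HTTPBasicAuthHandler, which waits for a 401/407 challenge that many feed servers never issue correctly. The same idea as a standalone handler sketch:

```python
import base64
import urllib2

class PreemptiveBasicAuth(urllib2.BaseHandler):
    """Attach an Authorization header to every request up front."""
    def __init__(self, user, password):
        self.auth = base64.b64encode(user + ":" + password)
    def http_request(self, req):
        req.add_header("Authorization", "Basic " + self.auth)
        return req
    https_request = http_request

opener = urllib2.build_opener(PreemptiveBasicAuth("user", "secret"))
# opener.open("http://example.org/feed") would now send the header
# on the first request, with no 401 round-trip.
```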
+ handler_order = 900 + + def __init__(self): + self.log = [] + + def http_response(self, req, response): + entry = { + "url": req.get_full_url(), + "status": response.getcode(), + } + location = response.info().get("Location") + if location is not None: + entry["location"] = location + self.log.append(entry) + return response + + https_response = http_response + + def get_log(self): + return self.log + non_alphanumeric_re = re.compile(r'<[^>]*>|\&[^\;]*\;|[^a-z0-9]') class Feed: """An RSS feed.""" @@ -341,12 +438,9 @@ class Feed: self.feed_info = {} def needs_update(self, now): - """Return 1 if it's time to update this feed, or 0 if its - update period has not yet elapsed.""" - if (now - self.last_update) < self.period: - return 0 - else: - return 1 + """Return True if it's time to update this feed, or False if + its update period has not yet elapsed.""" + return (now - self.last_update) >= self.period def get_state_filename(self): return "feeds/%s.state" % (short_hash(self.url),) @@ -356,100 +450,186 @@ class Feed: handlers = [] + logger = ResponseLogProcessor() + handlers.append(logger) + proxies = {} - for key, arg in self.args.items(): - if key.endswith("_proxy"): - proxies[key[:-6]] = arg + for name, value in self.args.items(): + if name.endswith("_proxy"): + proxies[name[:-6]] = value if len(proxies) != 0: handlers.append(urllib2.ProxyHandler(proxies)) if self.args.has_key("proxyuser") and self.args.has_key("proxypassword"): - mgr = DummyPasswordMgr((self.args["proxyuser"], self.args["proxypassword"])) - handlers.append(urllib2.ProxyBasicAuthHandler(mgr)) - - plugins.call_hook("add_urllib2_handlers", rawdog, config, self, handlers) + handlers.append(BasicAuthProcessor(self.args["proxyuser"], self.args["proxypassword"], proxy=True)) - auth_creds = None if self.args.has_key("user") and self.args.has_key("password"): - auth_creds = (self.args["user"], self.args["password"]) + handlers.append(BasicAuthProcessor(self.args["user"], self.args["password"])) - use_im = True if self.get_keepmin(config) == 0 or config["currentonly"]: - use_im = False + # If RFC 3229 and "A-IM: feed" is used, then there's + # no way to tell when an article has been removed. + # So if we only want to keep articles that are still + # being published by the feed, we have to turn it off. + handlers.append(DisableIMProcessor()) + + call_hook("add_urllib2_handlers", rawdog, config, self, handlers) + + url = self.url + # Turn plain filenames into file: URLs. (feedparser will open + # plain filenames itself, but we want it to open the file with + # urllib2 so we get a URLError if something goes wrong.) + if not ":" in url: + url = "file:" + url try: - return feedparser.parse(self.url, - etag = self.etag, - modified = self.modified, - agent = "rawdog/" + VERSION, - handlers = handlers, - auth_creds = auth_creds, - use_im = use_im) + result = feedparser.parse(url, + etag=self.etag, + modified=self.modified, + agent=HTTP_AGENT, + handlers=handlers) except Exception, e: - return { + result = { "rawdog_exception": e, "rawdog_traceback": sys.exc_info()[2], } + result["rawdog_responses"] = logger.get_log() + return result def update(self, rawdog, now, config, articles, p): """Add new articles from a feed to the collection. Returns True if any articles were read, False otherwise.""" - status = p.get("status") + # Note that feedparser might have thrown an exception -- + # so until we print the error message and return, we + # can't assume that p contains any particular field. 
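ResponseLogProcessor's handler_order of 900 slots it after header-mangling handlers (typically 500 or less) but before urllib2's HTTPErrorProcessor (1000), so it sees every hop of a redirect chain before errors are raised. A trimmed sketch showing the wiring:

```python
import urllib2

class ResponseLog(urllib2.BaseHandler):
    handler_order = 900  # after header manglers, before HTTPErrorProcessor

    def __init__(self):
        self.log = []

    def http_response(self, req, response):
        self.log.append((req.get_full_url(), response.getcode()))
        return response

    https_response = http_response

logger = ResponseLog()
opener = urllib2.build_opener(logger)
# After opener.open(url), logger.log holds one (url, status) pair per
# hop -- which is how update() can spot a 301 chain later.
```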
+ + responses = p.get("rawdog_responses") + if len(responses) > 0: + last_status = responses[-1]["status"] + elif len(p.get("feed", [])) != 0: + # Some protocol other than HTTP -- assume it's OK, + # since we got some content. + last_status = 200 + else: + # Timeout, or empty response from non-HTTP. + last_status = 0 + + version = p.get("version") + if version is None: + version = "" + self.last_update = now - error = None - non_fatal = False + errors = [] + fatal = False old_url = self.url + if "rawdog_exception" in p: - error = "Error fetching or parsing feed: %s" % str(p["rawdog_exception"]) + errors.append("Error fetching or parsing feed:") + errors.append(str(p["rawdog_exception"])) if config["showtracebacks"]: from traceback import format_tb - error += "\n" + "".join(format_tb(p["rawdog_traceback"])) - elif status is None and len(p["feed"]) == 0: - if config["ignoretimeouts"]: - return False + errors.append("".join(format_tb(p["rawdog_traceback"]))) + errors.append("") + fatal = True + + if len(responses) != 0 and responses[0]["status"] == 301: + # Permanent redirect(s). Find the new location. + i = 0 + while i < len(responses) and responses[i]["status"] == 301: + i += 1 + location = responses[i - 1].get("location") + + # According to RFC 2616, the Location header should be + # an absolute URI. This doesn't stop the occasional + # server sending something like "Location: /" or + # "Location: //foo/bar". It's usually a sign of + # brokenness, so fail rather than trying to interpret + # it liberally. + valid_uri = True + if location is not None: + parsed = urlparse.urlparse(location) + if parsed.scheme == "" or parsed.netloc == "": + valid_uri = False + + if not valid_uri: + errors.append("New URL: " + location) + errors.append("The feed returned a permanent redirect, but with an invalid new location.") + elif location is None: + errors.append("The feed returned a permanent redirect, but without a new location.") else: - error = "Timeout while reading feed." - elif status is None: - # Fetched by some protocol that doesn't have status. - pass - elif status == 301: - # Permanent redirect. The feed URL needs changing. - - error = "New URL: " + p["url"] + "\n" - error += "The feed has moved permanently to a new URL.\n" - if config["changeconfig"]: - rawdog.change_feed_url(self.url, p["url"], config) - error += "The config file has been updated automatically." + errors.append("New URL: " + location) + errors.append("The feed has moved permanently to a new URL.") + if config["changeconfig"]: + rawdog.change_feed_url(self.url, location, config) + errors.append("The config file has been updated automatically.") + else: + errors.append("You should update its entry in your config file.") + errors.append("") + + bozo_exception = p.get("bozo_exception") + got_urlerror = isinstance(bozo_exception, urllib2.URLError) + got_timeout = isinstance(bozo_exception, socket.timeout) + if got_urlerror or got_timeout: + # urllib2 reported an error when fetching the feed. + # Check to see if it was a timeout. + if not (got_timeout or is_timeout_exception(bozo_exception)): + errors.append("Error while fetching feed:") + errors.append(str(bozo_exception)) + errors.append("") + fatal = True + elif config["ignoretimeouts"]: + return False else: - error += "You should update its entry in your config file." - non_fatal = True - elif status in [403, 410]: - # The feed is disallowed or gone. The feed should be unsubscribed. - error = "The feed has gone.\n" - error += "You should remove it from your config file." 
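The redirect handling above deliberately rejects relative Location headers: RFC 2616 requires an absolute URI, and values like "/" or "//foo/bar" are usually server bugs. The urlparse test it applies, in isolation:

```python
import urlparse

def is_absolute_uri(location):
    parsed = urlparse.urlparse(location)
    return parsed.scheme != "" and parsed.netloc != ""

print is_absolute_uri("http://example.org/feed")  # True
print is_absolute_uri("/")                        # False: no scheme or host
print is_absolute_uri("//example.org/feed")       # False: scheme missing
```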
- elif status / 100 in [4, 5]: - # Some sort of client or server error. The feed may need unsubscribing. - error = "The feed returned an error.\n" - error += "If this condition persists, you should remove it from your config file." - - plugins.call_hook("feed_fetched", rawdog, config, self, p, error, non_fatal) - - if error is not None: + errors.append("Timeout while reading feed.") + errors.append("") + fatal = True + elif last_status == 304: + # The feed hasn't changed. Return False to indicate + # that we shouldn't do expiry. + return False + elif last_status in [403, 410]: + # The feed is disallowed or gone. The feed should be + # unsubscribed. + errors.append("The feed has gone.") + errors.append("You should remove it from your config file.") + errors.append("") + fatal = True + elif last_status / 100 != 2: + # Some sort of client or server error. The feed may + # need unsubscribing. + errors.append("The feed returned an error.") + errors.append("If this condition persists, you should remove it from your config file.") + errors.append("") + fatal = True + elif version == "" and len(p.get("entries", [])) == 0: + # feedparser couldn't detect the type of this feed or + # retrieve any entries from it. + errors.append("The data retrieved from this URL could not be understood as a feed.") + errors.append("You should check whether the feed has changed URLs or been removed.") + errors.append("") + fatal = True + + old_error = "\n".join(errors) + call_hook("feed_fetched", rawdog, config, self, p, old_error, not fatal) + + if len(errors) != 0: print >>sys.stderr, "Feed: " + old_url - if status is not None: - print >>sys.stderr, "HTTP Status: " + str(status) - print >>sys.stderr, error - print >>sys.stderr - if not non_fatal: + if last_status != 0: + print >>sys.stderr, "HTTP Status: " + str(last_status) + for line in errors: + print >>sys.stderr, line + if fatal: return False + # From here, we can assume that we've got a complete feedparser + # response. + p = ensure_unicode(p, p.get("encoding") or "UTF-8") - # In the event that the feed hasn't changed, then both channel - # and feed will be empty. In this case we return 0 so that - # we know not to expire articles that came from this feed. + # No entries means the feed hasn't changed, but for some reason + # we didn't get a 304 response. Handle it the same way. 
if len(p["entries"]) == 0: return False @@ -467,15 +647,15 @@ class Feed: if a.feed == feed and id is not None: article_ids[id] = a - seen = {} + seen_articles = set() sequence = 0 for entry_info in p["entries"]: article = Article(feed, entry_info, now, sequence) - ignore = plugins.Box(False) - plugins.call_hook("article_seen", rawdog, config, article, ignore) + ignore = Box(False) + call_hook("article_seen", rawdog, config, article, ignore) if ignore.value: continue - seen[article.hash] = True + seen_articles.add(article.hash) sequence += 1 id = entry_info.get("id") @@ -488,14 +668,14 @@ class Feed: if existing_article is not None: existing_article.update_from(article, now) - plugins.call_hook("article_updated", rawdog, config, existing_article, now) + call_hook("article_updated", rawdog, config, existing_article, now) else: articles[article.hash] = article - plugins.call_hook("article_added", rawdog, config, article, now) + call_hook("article_added", rawdog, config, article, now) if config["currentonly"]: for (hash, a) in articles.items(): - if a.feed == feed and not seen.has_key(hash): + if a.feed == feed and hash not in seen_articles: del articles[hash] return True @@ -526,10 +706,7 @@ class Feed: return non_alphanumeric_re.sub('', r) def get_keepmin(self, config): - try: - return int(self.args["keepmin"]) - except: - return config["keepmin"] + return self.args.get("keepmin", config["keepmin"]) class Article: """An article retrieved from an RSS feed.""" @@ -539,14 +716,15 @@ class Article: self.entry_info = entry_info self.sequence = sequence - # HOTFIX for missing date sorting. - # TODO: Why is this needed? - #modified = entry_info.get("modified_parsed") - modified = entry_info.get("updated_parsed") or entry_info.get("published_parsed") self.date = None - if modified is not None: + parsed = entry_info.get("updated_parsed") + if parsed is None: + parsed = entry_info.get("published_parsed") + if parsed is None: + parsed = entry_info.get("created_parsed") + if parsed is not None: try: - self.date = calendar.timegm(modified) + self.date = calendar.timegm(parsed) except OverflowError: pass @@ -561,21 +739,21 @@ class Article: system (i.e. 
it can't just be the article ID, because that would collide if more than one feed included the same article).""" - h = new_sha1() + h = hashlib.sha1() def add_hash(s): h.update(s.encode("UTF-8")) add_hash(self.feed) entry_info = self.entry_info - if entry_info.has_key("title_raw"): - add_hash(entry_info["title_raw"]) + if entry_info.has_key("title"): + add_hash(entry_info["title"]) if entry_info.has_key("link"): add_hash(entry_info["link"]) if entry_info.has_key("content"): for content in entry_info["content"]: - add_hash(content["value_raw"]) + add_hash(content["value"]) if entry_info.has_key("summary_detail"): - add_hash(entry_info["summary_detail"]["value_raw"]) + add_hash(entry_info["summary_detail"]["value"]) return h.hexdigest() @@ -588,7 +766,7 @@ class Article: self.last_seen = now def can_expire(self, now, config): - return ((now - self.last_seen) > config["expireage"]) + return (now - self.last_seen) > config["expireage"] def get_sort_date(self, config): if config["sortbyfeeddate"]: @@ -600,7 +778,7 @@ class DayWriter: """Utility class for writing day sections into a series of articles.""" def __init__(self, file, config): - self.lasttime = [-1, -1, -1, -1, -1] + self.lasttime = [] self.file = file self.counter = 0 self.config = config @@ -618,7 +796,11 @@ class DayWriter: self.counter += 1 def time(self, s): - tm = time.localtime(s) + try: + tm = time.localtime(s) + except ValueError: + # e.g. "timestamp out of range for platform time_t" + return if tm[:3] != self.lasttime[:3] and self.config["daysections"]: self.close(0) self.start_day(tm) @@ -630,17 +812,23 @@ class DayWriter: self.start_time(tm) self.lasttime = tm - def close(self, n = 0): + def close(self, n=0): while self.counter > n: print >>self.file, "</div>" self.counter -= 1 -def parse_time(value, default = "m"): +def parse_time(value, default="m"): """Parse a time period with optional units (s, m, h, d, w) into a time in seconds. If no unit is specified, use minutes by default; specify the default argument to change this. Raises ValueError if the format isn't recognised.""" - units = { "s" : 1, "m" : 60, "h" : 3600, "d" : 86400, "w" : 604800 } + units = { + "s": 1, + "m": 60, + "h": 3600, + "d": 86400, + "w": 604800, + } for unit, size in units.items(): if value.endswith(unit): return int(value[:-len(unit)]) * size @@ -651,9 +839,9 @@ def parse_bool(value): the value isn't recognised.""" value = value.strip().lower() if value == "0" or value == "false": - return 0 + return False elif value == "1" or value == "true": - return 1 + return True else: raise ValueError("Bad boolean value: " + value) @@ -662,7 +850,8 @@ def parse_list(value): return value.strip().split(None) def parse_feed_args(argparams, arglines): - """Parse a list of feed arguments. Raise ConfigError if the syntax is invalid.""" + """Parse a list of feed arguments. 
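parse_time() above accepts a bare number or a number with a unit suffix; feed arguments like maxage now go through it during config parsing rather than at use time. Reproduced as a standalone function with example values:

```python
def parse_time(value, default="m"):
    """Parse '30', '90s', '3h', '2d', '1w' etc. into seconds; bare
    numbers use `default` as their unit (minutes here)."""
    units = {"s": 1, "m": 60, "h": 3600, "d": 86400, "w": 604800}
    for unit, size in units.items():
        if value.endswith(unit):
            return int(value[:-len(unit)]) * size
    return int(value) * units[default]

print parse_time("3h")  # 10800
print parse_time("30")  # 1800 -- minutes by default
```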
Raise ConfigError if the syntax is + invalid, or ValueError if an argument value can't be parsed.""" args = {} for p in argparams: ps = p.split("=", 1) @@ -674,23 +863,36 @@ def parse_feed_args(argparams, arglines): if len(ps) != 2: raise ConfigError("Bad argument line in config: " + p) args[ps[0]] = ps[1] - if "maxage" in args: - args["maxage"] = parse_time(args["maxage"]) + for name, value in args.items(): + if name == "allowduplicates": + args[name] = parse_bool(value) + elif name == "keepmin": + args[name] = int(value) + elif name == "maxage": + args[name] = parse_time(value) return args -class ConfigError(Exception): pass +class ConfigError(Exception): + pass class Config: """The aggregator's configuration.""" - def __init__(self, locking): + def __init__(self, locking=True, logfile_name=None): self.locking = locking self.files_loaded = [] - if have_threading: - self.loglock = threading.Lock() + self.loglock = threading.Lock() + self.logfile = None + if logfile_name: + self.logfile = open(logfile_name, "a") self.reset() def reset(self): + # Note that these default values are *not* the same as + # in the supplied config file. The idea is that someone + # who has an old config file shouldn't notice a difference + # in behaviour on upgrade -- so new options generally + # default to False here, and True in the sample file. self.config = { "feedslist" : [], "feeddefaults" : {}, @@ -703,30 +905,38 @@ class Config: "dayformat" : "%A, %d %B %Y", "timeformat" : "%I:%M %p", "datetimeformat" : None, - "userefresh" : 0, - "showfeeds" : 1, + "userefresh" : False, + "showfeeds" : True, "timeout" : 30, - "template" : "default", + "pagetemplate" : "default", "itemtemplate" : "default", - "verbose" : 0, - "ignoretimeouts" : 0, - "showtracebacks" : 0, - "daysections" : 1, - "timesections" : 1, - "blocklevelhtml" : 1, - "tidyhtml" : 0, - "sortbyfeeddate" : 0, - "currentonly" : 0, - "hideduplicates" : "", + "feedlisttemplate" : "default", + "feeditemtemplate" : "default", + "verbose" : False, + "ignoretimeouts" : False, + "showtracebacks" : False, + "daysections" : True, + "timesections" : True, + "blocklevelhtml" : True, + "tidyhtml" : False, + "sortbyfeeddate" : False, + "currentonly" : False, + "hideduplicates" : [], "newfeedperiod" : "3h", - "changeconfig": 0, - "numthreads": 0, - "splitstate": 0, - "useids": 0, + "changeconfig": False, + "numthreads": 1, + "splitstate": False, + "useids": False, } - def __getitem__(self, key): return self.config[key] - def __setitem__(self, key, value): self.config[key] = value + def __getitem__(self, key): + return self.config[key] + + def get(self, key, default=None): + return self.config.get(key, default) + + def __setitem__(self, key, value): + self.config[key] = value def reload(self): self.log("Reloading config files") @@ -734,7 +944,7 @@ class Config: for filename in self.files_loaded: self.load(filename, False) - def load(self, filename, explicitly_loaded = True): + def load(self, filename, explicitly_loaded=True): """Load configuration from a config file.""" if explicitly_loaded: self.files_loaded.append(filename) @@ -743,7 +953,12 @@ class Config: try: f = open(filename, "r") for line in f.xreadlines(): - stripped = line.decode(get_system_encoding()).strip() + try: + line = line.decode(get_system_encoding()) + except UnicodeDecodeError, e: + raise ConfigError("Character encoding problem in config file: " + filename + ": " + str(e)) + + stripped = line.strip() if stripped == "" or stripped[0] == "#": continue if line[0] in string.whitespace: @@ -771,6 
+986,11 @@ class Config: elif len(l) != 2: raise ConfigError("Bad line in config: " + line) + # Load template files immediately, so we produce an error now + # rather than later if anything goes wrong. + if l[0].endswith("template") and l[1] != "default": + load_file(l[1]) + handled_arglines = False if l[0] == "feed": l = l[1].split(None) @@ -788,7 +1008,7 @@ class Config: self["defines"][l[0]] = l[1] elif l[0] == "plugindirs": for dir in parse_list(l[1]): - plugins.load_plugins(dir, self) + load_plugins(dir, self) elif l[0] == "outputfile": self["outputfile"] = l[1] elif l[0] == "maxarticles": @@ -811,10 +1031,14 @@ class Config: self["showfeeds"] = parse_bool(l[1]) elif l[0] == "timeout": self["timeout"] = parse_time(l[1], "s") - elif l[0] == "template": - self["template"] = l[1] + elif l[0] in ("template", "pagetemplate"): + self["pagetemplate"] = l[1] elif l[0] == "itemtemplate": self["itemtemplate"] = l[1] + elif l[0] == "feedlisttemplate": + self["feedlisttemplate"] = l[1] + elif l[0] == "feeditemtemplate": + self["feeditemtemplate"] = l[1] elif l[0] == "verbose": self["verbose"] = parse_bool(l[1]) elif l[0] == "ignoretimeouts": @@ -847,9 +1071,9 @@ class Config: self["useids"] = parse_bool(l[1]) elif l[0] == "include": self.load(l[1], False) - elif plugins.call_hook("config_option_arglines", self, l[0], l[1], arglines): + elif call_hook("config_option_arglines", self, l[0], l[1], arglines): handled_arglines = True - elif plugins.call_hook("config_option", self, l[0], l[1]): + elif call_hook("config_option", self, l[0], l[1]): pass else: raise ConfigError("Unknown config command: " + l[0]) @@ -858,13 +1082,16 @@ class Config: raise ConfigError("Bad argument lines in config after: " + line) def log(self, *args): - """If running in verbose mode, print a status message.""" + """Print a status message. 
If running in verbose mode, write + the message to stderr; if using a logfile, write it to the + logfile.""" if self["verbose"]: - if have_threading: - self.loglock.acquire() - print >>sys.stderr, "".join(map(str, args)) - if have_threading: - self.loglock.release() + with self.loglock: + print >>sys.stderr, "".join(map(str, args)) + if self.logfile is not None: + with self.loglock: + print >>self.logfile, "".join(map(str, args)) + self.logfile.flush() def bug(self, *args): """Report detection of a bug in rawdog.""" @@ -897,20 +1124,19 @@ class AddFeedEditor: def add_feed(filename, url, rawdog, config): """Try to add a feed to the config file.""" - if feedfinder is None: - feeds = [url] - else: - feeds = feedfinder.feeds(url) + feeds = rawdoglib.feedscanner.feeds(url) if feeds == []: print >>sys.stderr, "Cannot find any feeds in " + url - else: - feed = feeds[0] - if feed in rawdog.feeds: - print >>sys.stderr, "Feed " + feed + " is already in the config file" - else: - print >>sys.stderr, "Adding feed " + feed - feedline = "feed %s %s\n" % (config["newfeedperiod"], feed) - edit_file(filename, AddFeedEditor(feedline).edit) + return + + feed = feeds[0] + if feed in rawdog.feeds: + print >>sys.stderr, "Feed " + feed + " is already in the config file" + return + + print >>sys.stderr, "Adding feed " + feed + feedline = "feed %s %s\n" % (config["newfeedperiod"], feed) + edit_file(filename, AddFeedEditor(feedline).edit) class ChangeFeedEditor: def __init__(self, oldurl, newurl): @@ -927,13 +1153,13 @@ class RemoveFeedEditor: def __init__(self, url): self.url = url def edit(self, inputfile, outputfile): - while 1: + while True: l = inputfile.readline() if l == "": break ls = l.strip().split(None) if len(ls) > 2 and ls[0] == "feed" and ls[2] == self.url: - while 1: + while True: l = inputfile.readline() if l == "": break @@ -960,63 +1186,58 @@ class FeedFetcher: self.rawdog = rawdog self.config = config self.lock = threading.Lock() - self.jobs = {} - for feed in feedlist: - self.jobs[feed] = 1 + self.jobs = set(feedlist) self.results = {} def worker(self, num): rawdog = self.rawdog config = self.config - config.log("Thread ", num, " starting") - while 1: - self.lock.acquire() - if self.jobs == {}: - job = None - else: - job = self.jobs.keys()[0] - del self.jobs[job] - self.lock.release() - if job is None: - break + while True: + with self.lock: + try: + job = self.jobs.pop() + except KeyError: + # No jobs left. + break - config.log("Thread ", num, " fetching feed: ", job) + config.log("[", num, "] Fetching feed: ", job) feed = rawdog.feeds[job] - plugins.call_hook("pre_update_feed", rawdog, config, feed) - self.results[job] = feed.fetch(rawdog, config) - config.log("Thread ", num, " done") + call_hook("pre_update_feed", rawdog, config, feed) + result = feed.fetch(rawdog, config) - def run(self, numworkers): - self.config.log("Thread farm starting with ", len(self.jobs), " jobs") - workers = [] - for i in range(numworkers): - self.lock.acquire() - isempty = (self.jobs == {}) - self.lock.release() - if isempty: - # No jobs left in the queue -- don't bother - # starting any more workers. 
- break + with self.lock: + self.results[job] = result + + def run(self, max_workers): + max_workers = max(max_workers, 1) + num_workers = min(max_workers, len(self.jobs)) - t = threading.Thread(target = self.worker, args = (i,)) + self.config.log("Fetching ", len(self.jobs), " feeds using ", + num_workers, " threads") + workers = [] + for i in range(1, num_workers): + t = threading.Thread(target=self.worker, args=(i,)) t.start() workers.append(t) + self.worker(0) for worker in workers: worker.join() - self.config.log("Thread farm finished with ", len(self.results), " results") + self.config.log("Fetch complete") return self.results class FeedState(Persistable): """The collection of articles in a feed.""" def __init__(self): + Persistable.__init__(self) self.articles = {} class Rawdog(Persistable): """The aggregator itself.""" def __init__(self): + Persistable.__init__(self) self.feeds = {} self.articles = {} self.plugin_storage = {} @@ -1053,18 +1274,20 @@ class Rawdog(Persistable): edit_file("config", ChangeFeedEditor(oldurl, newurl).edit) feed = self.feeds[oldurl] + # Changing the URL will change the state filename as well, + # so we need to save the old name to load from. old_state = feed.get_state_filename() feed.url = newurl del self.feeds[oldurl] self.feeds[newurl] = feed if config["splitstate"]: - persister, feedstate = load_persisted(feed.get_state_filename(), FeedState, config) - for article in feedstate.articles.values(): - article.feed = newurl - feedstate.modified() - save_persisted(persister, config) - os.rename(old_state, feed.get_state_filename()) + feedstate_p = persister.get(FeedState, old_state) + feedstate_p.rename(feed.get_state_filename()) + with feedstate_p as feedstate: + for article in feedstate.articles.values(): + article.feed = newurl + feedstate.modified() else: for article in self.articles.values(): if article.feed == oldurl: @@ -1086,9 +1309,18 @@ class Rawdog(Persistable): """Update rawdog's internal state to match the configuration.""" + # Make sure the splitstate directory exists. + if config["splitstate"]: + try: + os.mkdir("feeds") + except OSError: + # Most likely it already exists. + pass + + # Convert to or from splitstate if necessary. try: u = self.using_splitstate - except: + except AttributeError: # We were last run with a version of rawdog that didn't # have this variable -- so we must have a single state # file. 
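rename_feed() above, and the state-conversion hunk that follows, go through the new persister API: persister.get() returns a handle that can be opened and closed explicitly, used as a with-statement context manager, or renamed along with its backing file. A rough sketch of that handle's shape, assuming a simple pickle-backed store; the real implementation is rawdoglib/persister.py, which also deals with file locking:

```python
import os
import pickle

class PersistedSketch(object):
    """Hypothetical stand-in for the handles persister.get() returns:
    load on open()/__enter__, save on close()/__exit__ if the object
    reported itself modified in between."""

    def __init__(self, klass, filename):
        self.klass = klass
        self.filename = filename
        self.object = None

    def open(self):
        try:
            f = open(self.filename, "rb")
        except IOError:
            # No state file yet -- start with a fresh object.
            self.object = self.klass()
        else:
            self.object = pickle.load(f)
            f.close()
        return self.object

    def close(self):
        if self.object is not None and self.object.is_modified():
            f = open(self.filename, "wb")
            pickle.dump(self.object, f)
            f.close()

    def rename(self, new_filename):
        # Move the backing file too, as rename_feed() above does when
        # a feed URL (and hence its state filename) changes.
        os.rename(self.filename, new_filename)
        self.filename = new_filename

    def __enter__(self):
        return self.open()

    def __exit__(self, *exc_info):
        self.close()
        return False
```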
@@ -1099,31 +1331,29 @@ class Rawdog(Persistable): if config["splitstate"]: config.log("Converting to split state files") for feed_hash, feed in self.feeds.items(): - persister, feedstate = load_persisted(feed.get_state_filename(), FeedState, config) - feedstate.articles = {} - for article_hash, article in self.articles.items(): - if article.feed == feed_hash: - feedstate.articles[article_hash] = article - feedstate.modified() - save_persisted(persister, config) + with persister.get(FeedState, feed.get_state_filename()) as feedstate: + feedstate.articles = {} + for article_hash, article in self.articles.items(): + if article.feed == feed_hash: + feedstate.articles[article_hash] = article + feedstate.modified() self.articles = {} else: config.log("Converting to single state file") self.articles = {} for feed_hash, feed in self.feeds.items(): - persister, feedstate = load_persisted(feed.get_state_filename(), FeedState, config) - for article_hash, article in feedstate.articles.items(): - self.articles[article_hash] = article - feedstate.articles = {} - feedstate.modified() - save_persisted(persister, config) - os.unlink(feed.get_state_filename()) + with persister.get(FeedState, feed.get_state_filename()) as feedstate: + for article_hash, article in feedstate.articles.items(): + self.articles[article_hash] = article + feedstate.articles = {} + feedstate.modified() + persister.delete(feed.get_state_filename()) self.modified() self.using_splitstate = config["splitstate"] - seenfeeds = {} + seen_feeds = set() for (url, period, args) in config["feedslist"]: - seenfeeds[url] = 1 + seen_feeds.add(url) if not self.feeds.has_key(url): config.log("Adding new feed: ", url) self.feeds[url] = Feed(url) @@ -1141,13 +1371,10 @@ class Rawdog(Persistable): feed.args = newargs self.modified() for url in self.feeds.keys(): - if not seenfeeds.has_key(url): + if url not in seen_feeds: config.log("Removing feed: ", url) if config["splitstate"]: - try: - os.unlink(self.feeds[url].get_state_filename()) - except OSError: - pass + persister.delete(self.feeds[url].get_state_filename()) else: for key, article in self.articles.items(): if article.feed == url: @@ -1155,16 +1382,13 @@ class Rawdog(Persistable): del self.feeds[url] self.modified() - def update(self, config, feedurl = None): + def update(self, config, feedurl=None): """Perform the update action: check feeds for new articles, and expire old ones.""" config.log("Starting update") now = time.time() - feedparser._FeedParserMixin.can_contain_relative_uris = ["url"] - feedparser._FeedParserMixin.can_contain_dangerous_markup = [] - feedparser.BeautifulSoup = None - set_socket_timeout(config["timeout"]) + socket.setdefaulttimeout(config["timeout"]) if feedurl is None: update_feeds = [url for url in self.feeds.keys() @@ -1180,14 +1404,14 @@ class Rawdog(Persistable): numfeeds = len(update_feeds) config.log("Will update ", numfeeds, " feeds") - if have_threading and config["numthreads"] > 0: - fetcher = FeedFetcher(self, update_feeds, config) - prefetched = fetcher.run(config["numthreads"]) - else: - prefetched = {} + fetcher = FeedFetcher(self, update_feeds, config) + fetched = fetcher.run(config["numthreads"]) - seen_some_items = {} + seen_some_items = set() def do_expiry(articles): + """Expire articles from a list. 
Return True if any + articles were expired.""" + feedcounts = {} for key, article in articles.items(): url = article.feed @@ -1209,45 +1433,45 @@ class Rawdog(Persistable): count += 1 del articles[key] continue - if (seen_some_items.has_key(url) + if (url in seen_some_items and self.feeds.has_key(url) and article.can_expire(now, config) and feedcounts[url] > self.feeds[url].get_keepmin(config)): - plugins.call_hook("article_expired", self, config, article, now) + call_hook("article_expired", self, config, article, now) count += 1 feedcounts[url] -= 1 del articles[key] config.log("Expired ", count, " articles, leaving ", len(articles)) + return count > 0 + count = 0 for url in update_feeds: count += 1 - config.log("Updating feed ", count, " of " , numfeeds, ": ", url) + config.log("Updating feed ", count, " of ", numfeeds, ": ", url) feed = self.feeds[url] if config["splitstate"]: - persister, feedstate = load_persisted(feed.get_state_filename(), FeedState, config) + feedstate_p = persister.get(FeedState, feed.get_state_filename()) + feedstate = feedstate_p.open() articles = feedstate.articles else: articles = self.articles - if url in prefetched: - content = prefetched[url] - else: - plugins.call_hook("pre_update_feed", self, config, feed) - content = feed.fetch(self, config) - plugins.call_hook("mid_update_feed", self, config, feed, content) + content = fetched[url] + call_hook("mid_update_feed", self, config, feed, content) rc = feed.update(self, now, config, articles, content) url = feed.url - plugins.call_hook("post_update_feed", self, config, feed, rc) + call_hook("post_update_feed", self, config, feed, rc) if rc: - seen_some_items[url] = 1 + seen_some_items.add(url) if config["splitstate"]: feedstate.modified() if config["splitstate"]: - do_expiry(articles) - save_persisted(persister, config) + if do_expiry(articles): + feedstate.modified() + feedstate_p.close() if config["splitstate"]: self.articles = {} @@ -1257,22 +1481,25 @@ class Rawdog(Persistable): self.modified() config.log("Finished update") - def get_template(self, config): - """Get the main template.""" - if config["template"] != "default": - return load_file(config["template"]) + def get_template(self, config, name="page"): + """Return the contents of a template.""" + + filename = config.get(name + "template", "default") + if filename != "default": + return load_file(filename) - template = """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" + if name == "page": + template = """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"> <html lang="en"> <head> <meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"> <meta name="robots" content="noindex,nofollow,noarchive"> """ - if config["userefresh"]: - template += """__refresh__ + if config["userefresh"]: + template += """__refresh__ """ - template += """ <link rel="stylesheet" href="style.css" type="text/css"> + template += """ <link rel="stylesheet" href="style.css" type="text/css"> <title>rawdog</title> </head> <body id="rawdog"> @@ -1283,13 +1510,13 @@ class Rawdog(Persistable): __items__ </div> """ - if config["showfeeds"]: - template += """<h2 id="feedstatsheader">Feeds</h2> + if config["showfeeds"]: + template += """<h2 id="feedstatsheader">Feeds</h2> <div id="feedstats"> __feeds__ </div> """ - template += """<div id="footer"> + template += """<div id="footer"> <p id="aboutrawdog">Generated by <a href="http://offog.org/code/rawdog.html">rawdog</a> version __version__ @@ -1298,14 +1525,9 @@ by <a 
href="mailto:ats@offog.org">Adam Sampson</a>.</p> </body> </html> """ - return template - - def get_itemtemplate(self, config): - """Get the item template.""" - if config["itemtemplate"] != "default": - return load_file(config["itemtemplate"]) - - template = """<div class="item feed-__feed_hash__ feed-__feed_id__" id="item-__hash__"> + return template + elif name == "item": + return """<div class="item feed-__feed_hash__ feed-__feed_id__" id="item-__hash__"> <p class="itemheader"> <span class="itemtitle">__title__</span> <span class="itemfrom">[__feed_title__]</span> @@ -1316,20 +1538,36 @@ __description__ </div> """ - return template - - def show_template(self, config): - """Show the configured main template.""" - print self.get_template(config) + elif name == "feedlist": + return """<table id="feeds"> +<tr id="feedsheader"> +<th>Feed</th><th>RSS</th><th>Last fetched</th><th>Next fetched after</th> +</tr> +__feeditems__ +</table> +""" + elif name == "feeditem": + return """ +<tr class="feedsrow"> +<td>__feed_title__</td> +<td>__feed_icon__</td> +<td>__feed_last_update__</td> +<td>__feed_next_update__</td> +</tr> +""" + else: + raise KeyError("Unknown template name: " + name) - def show_itemtemplate(self, config): - """Show the configured item template.""" - print self.get_itemtemplate(config) + def show_template(self, name, config): + """Show the contents of a template, as currently configured.""" + try: + print self.get_template(config, name), + except KeyError: + print >>sys.stderr, "Unknown template name: " + name def write_article(self, f, article, config): """Write an article to the given file.""" feed = self.feeds[article.feed] - feed_info = feed.feed_info entry_info = article.entry_info link = entry_info.get("link") @@ -1340,7 +1578,7 @@ __description__ if guid == "": guid = None - itembits = {} + itembits = self.get_feed_bits(config, feed) for name, value in feed.args.items(): if name.startswith("define_"): itembits[name[7:]] = sanitise_html(value, "", True, config) @@ -1355,7 +1593,7 @@ __description__ if key is None: description = None else: - force_preformatted = feed.args.has_key("format") and (feed.args["format"] == "text") + force_preformatted = (feed.args.get("format", "default") == "text") description = detail_to_html(entry_info[key], False, config, force_preformatted) date = article.date @@ -1379,11 +1617,6 @@ __description__ else: itembits["title"] = '<a href="' + string_to_html(link, config) + '">' + title + '</a>' - itembits["feed_title_no_link"] = detail_to_html(feed_info.get("title_detail"), True, config) - itembits["feed_title"] = feed.get_html_link(config) - itembits["feed_url"] = string_to_html(feed.url, config) - itembits["feed_hash"] = short_hash(feed.url) - itembits["feed_id"] = feed.get_id(config) itembits["hash"] = short_hash(article.hash) if description is not None: @@ -1403,24 +1636,22 @@ __description__ else: itembits["date"] = "" - plugins.call_hook("output_item_bits", self, config, feed, article, itembits) - itemtemplate = self.get_itemtemplate(config) + call_hook("output_item_bits", self, config, feed, article, itembits) + itemtemplate = self.get_template(config, "item") f.write(fill_template(itemtemplate, itembits)) def write_remove_dups(self, articles, config, now): """Filter the list of articles to remove articles that are too old or are duplicates.""" kept_articles = [] - seen_links = {} - seen_guids = {} + seen_links = set() + seen_guids = set() dup_count = 0 for article in articles: feed = self.feeds[article.feed] age = now - article.added - 
maxage = config["maxage"] - if "maxage" in feed.args: - maxage = feed.args["maxage"] + maxage = feed.args.get("maxage", config["maxage"]) if maxage != 0 and age > maxage: continue @@ -1434,19 +1665,17 @@ __description__ if guid == "": guid = None - if feed.args.get("allowduplicates") != "true": + if not feed.args.get("allowduplicates", False): is_dup = False for key in config["hideduplicates"]: if key == "id" and guid is not None: - if seen_guids.has_key(guid): + if guid in seen_guids: is_dup = True - seen_guids[guid] = 1 - break + seen_guids.add(guid) elif key == "link" and link is not None: - if seen_links.has_key(link): + if link in seen_links: is_dup = True - seen_links[link] = 1 - break + seen_links.add(link) if is_dup: dup_count += 1 continue @@ -1454,37 +1683,56 @@ __description__ kept_articles.append(article) return (kept_articles, dup_count) + def get_feed_bits(self, config, feed): + """Get the bits that are used to describe a feed.""" + + bits = {} + bits["feed_id"] = feed.get_id(config) + bits["feed_hash"] = short_hash(feed.url) + bits["feed_title"] = feed.get_html_link(config) + bits["feed_title_no_link"] = detail_to_html(feed.feed_info.get("title_detail"), True, config) + bits["feed_url"] = string_to_html(feed.url, config) + bits["feed_icon"] = '<a class="xmlbutton" href="' + cgi.escape(feed.url) + '">XML</a>' + bits["feed_last_update"] = format_time(feed.last_update, config) + bits["feed_next_update"] = format_time(feed.last_update + feed.period, config) + return bits + + def write_feeditem(self, f, feed, config): + """Write a feed list item.""" + bits = self.get_feed_bits(config, feed) + f.write(fill_template(self.get_template(config, "feeditem"), bits)) + + def write_feedlist(self, f, config): + """Write the feed list.""" + bits = {} + + feeds = [(feed.get_html_name(config).lower(), feed) + for feed in self.feeds.values()] + feeds.sort() + + feeditems = StringIO() + for key, feed in feeds: + self.write_feeditem(feeditems, feed, config) + bits["feeditems"] = feeditems.getvalue() + feeditems.close() + + f.write(fill_template(self.get_template(config, "feedlist"), bits)) + def get_main_template_bits(self, config): """Get the bits that are used in the default main template, with the exception of items and num_items.""" - bits = { "version" : VERSION } + bits = {"version": VERSION} bits.update(config["defines"]) - refresh = config["expireage"] - for feed in self.feeds.values(): - if feed.period < refresh: refresh = feed.period - - bits["refresh"] = """<meta http-equiv="Refresh" """ + 'content="' + str(refresh) + '"' + """>""" + refresh = min([config["expireage"]] + + [feed.period for feed in self.feeds.values()]) + bits["refresh"] = '<meta http-equiv="Refresh" content="' + str(refresh) + '">' f = StringIO() - print >>f, """<table id="feeds"> -<tr id="feedsheader"> -<th>Feed</th><th>RSS</th><th>Last fetched</th><th>Next fetched after</th> -</tr>""" - feeds = [(feed.get_html_name(config).lower(), feed) - for feed in self.feeds.values()] - feeds.sort() - for (key, feed) in feeds: - print >>f, '<tr class="feedsrow">' - print >>f, '<td>' + feed.get_html_link(config) + '</td>' - print >>f, '<td><a class="xmlbutton" href="' + cgi.escape(feed.url) + '">XML</a></td>' - print >>f, '<td>' + format_time(feed.last_update, config) + '</td>' - print >>f, '<td>' + format_time(feed.last_update + feed.period, config) + '</td>' - print >>f, '</tr>' - print >>f, """</table>""" + self.write_feedlist(f, config) bits["feeds"] = f.getvalue() f.close() - bits["num_feeds"] = str(len(feeds)) + 
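write_remove_dups() above now tracks the article IDs and links it has already seen in sets rather than dictionaries with dummy values. The same filter in miniature, using (guid, link) pairs as a stand-in for rawdog's Article objects:

```python
def remove_dups(articles, hidedup_keys=("id", "link")):
    # Keep the first article seen with a given guid or link and drop
    # later repeats.  As in the loop above, a duplicate's own guid and
    # link are still recorded (the old "break" is gone).
    seen_guids = set()
    seen_links = set()
    kept = []
    dup_count = 0
    for guid, link in articles:
        is_dup = False
        for key in hidedup_keys:
            if key == "id" and guid is not None:
                if guid in seen_guids:
                    is_dup = True
                seen_guids.add(guid)
            elif key == "link" and link is not None:
                if link in seen_links:
                    is_dup = True
                seen_links.add(link)
        if is_dup:
            dup_count += 1
        else:
            kept.append((guid, link))
    return kept, dup_count

# remove_dups([("a", None), ("a", None), ("b", "x")])
# -> ([("a", None), ("b", "x")], 1)
```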
bits["num_feeds"] = str(len(self.feeds)) return bits @@ -1492,23 +1740,23 @@ __description__ """Write a regular rawdog HTML output file.""" f = StringIO() dw = DayWriter(f, config) - plugins.call_hook("output_items_begin", self, config, f) + call_hook("output_items_begin", self, config, f) for article in articles: - if not plugins.call_hook("output_items_heading", self, config, f, article, article_dates[article]): + if not call_hook("output_items_heading", self, config, f, article, article_dates[article]): dw.time(article_dates[article]) self.write_article(f, article, config) dw.close() - plugins.call_hook("output_items_end", self, config, f) + call_hook("output_items_end", self, config, f) bits = self.get_main_template_bits(config) bits["items"] = f.getvalue() f.close() bits["num_items"] = str(len(articles)) - plugins.call_hook("output_bits", self, config, bits) - s = fill_template(self.get_template(config), bits) + call_hook("output_bits", self, config, bits) + s = fill_template(self.get_template(config, "page"), bits) outputfile = config["outputfile"] if outputfile == "-": write_ascii(sys.stdout, s, config) @@ -1530,14 +1778,13 @@ __description__ if config["splitstate"]: article_list = [] for feed in self.feeds.values(): - persister, feedstate = load_persisted(feed.get_state_filename(), FeedState, config) - article_list += list_articles(feedstate.articles) - save_persisted(persister, config) + with persister.get(FeedState, feed.get_state_filename()) as feedstate: + article_list += list_articles(feedstate.articles) else: article_list = list_articles(self.articles) numarticles = len(article_list) - if not plugins.call_hook("output_sort_articles", self, config, article_list): + if not call_hook("output_sort_articles", self, config, article_list): article_list.sort() if config["maxarticles"] != 0: @@ -1558,10 +1805,9 @@ __description__ found = {} for (feed_url, article_hashes) in wanted.items(): feed = self.feeds[feed_url] - persister, feedstate = load_persisted(feed.get_state_filename(), FeedState, config) - for hash in article_hashes: - found[hash] = feedstate.articles[hash] - save_persisted(persister, config) + with persister.get(FeedState, feed.get_state_filename()) as feedstate: + for hash in article_hashes: + found[hash] = feedstate.articles[hash] else: found = self.articles @@ -1573,16 +1819,16 @@ __description__ articles.append(a) article_dates[a] = -date - plugins.call_hook("output_write", self, config, articles) + call_hook("output_write", self, config, articles) - if not plugins.call_hook("output_sorted_filter", self, config, articles): + if not call_hook("output_sorted_filter", self, config, articles): (articles, dup_count) = self.write_remove_dups(articles, config, now) else: dup_count = 0 config.log("Selected ", len(articles), " of ", numarticles, " articles to write; ignored ", dup_count, " duplicates") - if not plugins.call_hook("output_write_files", self, config, articles, article_dates): + if not call_hook("output_write_files", self, config, articles, article_dates): self.write_output_file(articles, article_dates, config) config.log("Finished write") @@ -1594,75 +1840,66 @@ Usage: rawdog [OPTION]... 
General options (use only once): -d|--dir DIR Use DIR instead of ~/.rawdog --v, --verbose Print more detailed status information -N, --no-locking Do not lock the state file +-v, --verbose Print more detailed status information +-V|--log FILE Append detailed status information to FILE -W, --no-lock-wait Exit silently if state file is locked ---help Display this help and exit Actions (performed in order given): --u, --update Fetch data from feeds and store it --l, --list List feeds known at time of last update --w, --write Write out HTML output --f|--update-feed URL Force an update on the single feed URL --c|--config FILE Read additional config file FILE --t, --show-template Print the template currently in use --T, --show-itemtemplate Print the item template currently in use -a|--add URL Try to find a feed associated with URL and add it to the config file +-c|--config FILE Read additional config file FILE +-f|--update-feed URL Force an update on the single feed URL +-l, --list List feeds known at time of last update -r|--remove URL Remove feed URL from the config file +-s|--show TEMPLATE Show the contents of a template + (TEMPLATE may be: page item feedlist feeditem) +-u, --update Fetch data from feeds and store it +-w, --write Write out HTML output Special actions (all other options are ignored if one of these is specified): ---upgrade OLDDIR NEWDIR Import feed state from rawdog 1.x directory - OLDDIR into rawdog 2.x directory NEWDIR +--dump URL Show what rawdog's parser returns for URL +--help Display this help and exit Report bugs to <ats@offog.org>.""" -def load_persisted(fn, klass, config, no_block = False): - """Attempt to load a persisted object. Returns the persister and the - object.""" - config.log("Loading state file: ", fn) - persister = Persister(fn, klass, config.locking) - try: - obj = persister.load(no_block = no_block) - except KeyboardInterrupt: - sys.exit(1) - except: - print "An error occurred while reading state from " + os.getcwd() + "/" + fn + "." - print "This usually means the file is corrupt, and removing it will fix the problem." - sys.exit(1) - return (persister, obj) - -def save_persisted(persister, config): - if persister.object.is_modified(): - config.log("Saving state file: ", persister.filename) - persister.save() - def main(argv): """The command-line interface to the aggregator.""" locale.setlocale(locale.LC_ALL, "") + # This is quite expensive and not threadsafe, so we do it on + # startup and cache the result. global system_encoding - try: - # This doesn't exist on Python 2.2. - # It's also quite expensive, which is why we do it on startup - # and cache the result. 
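As the new comment above notes, locale.getpreferredencoding() is quite expensive and not thread-safe, so main() now looks it up a single time at startup and caches the result. The idea in miniature:

```python
import locale

# Look the preferred encoding up once, at startup, and reuse it
# everywhere output has to be encoded.
locale.setlocale(locale.LC_ALL, "")
system_encoding = locale.getpreferredencoding()

def encode_for_output(text):
    # Hypothetical helper, not a rawdog function: text is assumed to
    # be a unicode string; anything the locale cannot represent is
    # replaced rather than raising UnicodeEncodeError.
    return text.encode(system_encoding, "replace")
```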
- system_encoding = locale.getpreferredencoding() - except: - system_encoding = "UTF-8" + system_encoding = locale.getpreferredencoding() try: - (optlist, args) = getopt.getopt(argv, "ulwf:c:tTd:va:r:NW", ["update", "list", "write", "update-feed=", "help", "config=", "show-template", "dir=", "show-itemtemplate", "verbose", "upgrade", "add=", "remove=", "no-locking", "no-lock-wait"]) + SHORTOPTS = "a:c:d:f:lNr:s:tTuvV:wW" + LONGOPTS = [ + "add=", + "config=", + "dir=", + "dump=", + "help", + "list", + "log=", + "no-lock-wait", + "no-locking", + "remove=", + "show=", + "show-itemtemplate", + "show-template", + "update", + "update-feed=", + "verbose", + "write", + ] + (optlist, args) = getopt.getopt(argv, SHORTOPTS, LONGOPTS) except getopt.GetoptError, s: print s usage() return 1 - for o, a in optlist: - if o == "--upgrade" and len(args) == 2: - import upgrade_1_2 - return upgrade_1_2.upgrade(args[0], args[1]) - if len(args) != 0: usage() return 1 @@ -1671,21 +1908,28 @@ def main(argv): statedir = os.environ["HOME"] + "/.rawdog" else: statedir = None - verbose = 0 - locking = 1 - no_lock_wait = 0 + verbose = False + logfile_name = None + locking = True + no_lock_wait = False for o, a in optlist: - if o == "--help": + if o == "--dump": + import pprint + pprint.pprint(feedparser.parse(a, agent=HTTP_AGENT)) + return 0 + elif o == "--help": usage() return 0 elif o in ("-d", "--dir"): statedir = a - elif o in ("-v", "--verbose"): - verbose = 1 elif o in ("-N", "--no-locking"): - locking = 0 + locking = False + elif o in ("-v", "--verbose"): + verbose = True + elif o in ("-V", "--log"): + logfile_name = a elif o in ("-W", "--no-lock-wait"): - no_lock_wait = 1 + no_lock_wait = True if statedir is None: print "$HOME not set and state dir not explicitly specified; please use -d/--dir" return 1 @@ -1698,7 +1942,7 @@ def main(argv): sys.path.append(".") - config = Config(locking) + config = Config(locking, logfile_name) def load_config(fn): try: config.load(fn) @@ -1708,9 +1952,16 @@ def main(argv): return 1 if verbose: config["verbose"] = True - load_config("config") + return 0 + rc = load_config("config") + if rc != 0: + return rc - persister, rawdog = load_persisted("state", Rawdog, config, no_lock_wait) + global persister + persister = Persister(config) + + rawdog_p = persister.get(Rawdog, "state") + rawdog = rawdog_p.open(no_block=no_lock_wait) if rawdog is None: return 0 if not rawdog.check_state_version(): @@ -1721,36 +1972,39 @@ def main(argv): rawdog.sync_from_config(config) - plugins.call_hook("startup", rawdog, config) + call_hook("startup", rawdog, config) for o, a in optlist: - if o in ("-u", "--update"): - rawdog.update(config) + if o in ("-a", "--add"): + add_feed("config", a, rawdog, config) + config.reload() + rawdog.sync_from_config(config) + elif o in ("-c", "--config"): + rc = load_config(a) + if rc != 0: + return rc + rawdog.sync_from_config(config) elif o in ("-f", "--update-feed"): rawdog.update(config, a) elif o in ("-l", "--list"): rawdog.list(config) - elif o in ("-w", "--write"): - rawdog.write(config) - elif o in ("-c", "--config"): - load_config(a) - rawdog.sync_from_config(config) - elif o in ("-t", "--show-template"): - rawdog.show_template(config) - elif o in ("-T", "--show-itemtemplate"): - rawdog.show_itemtemplate(config) - elif o in ("-a", "--add"): - add_feed("config", a, rawdog, config) - config.reload() - rawdog.sync_from_config(config) elif o in ("-r", "--remove"): remove_feed("config", a, config) config.reload() rawdog.sync_from_config(config) + elif o in ("-s", 
"--show"): + rawdog.show_template(a, config) + elif o in ("-t", "--show-template"): + rawdog.show_template("page", config) + elif o in ("-T", "--show-itemtemplate"): + rawdog.show_template("item", config) + elif o in ("-u", "--update"): + rawdog.update(config) + elif o in ("-w", "--write"): + rawdog.write(config) - plugins.call_hook("shutdown", rawdog, config) + call_hook("shutdown", rawdog, config) - save_persisted(persister, config) + rawdog_p.close() return 0 - |