author     Giuseppe D'Angelo <giuseppe.dangelo@kdab.com>    2018-05-06 20:30:33 +0200
committer  Tero Kojo <tero.kojo@qt.io>                      2018-05-07 07:40:52 +0000
commit     58f1f8a6411926488342d2f9872811520d8eb580 (patch)
tree       b8b3d93568551ddc4f489fa891e0be1c195533c9
parent     c7161128b2c74c92a631a4caea518bb41759f516 (diff)
Upgrade to rawdog 2.22

Change-Id: Iccd32db1068ca65ee1e6de88ad137fa3950efde5
Reviewed-by: Tero Kojo <tero.kojo@qt.io>
-rwxr-xr-x  rawdog                       7
-rw-r--r--  rawdoglib/__init__.py        6
-rw-r--r--  rawdoglib/feedfinder.py    366
-rw-r--r--  rawdoglib/feedscanner.py   137
-rw-r--r--  rawdoglib/persister.py     221
-rw-r--r--  rawdoglib/plugins.py        14
-rw-r--r--  rawdoglib/rawdog.py       1176
7 files changed, 1033 insertions, 894 deletions
diff --git a/rawdog b/rawdog
index 878ab78..3715145 100755
--- a/rawdog
+++ b/rawdog
@@ -1,6 +1,6 @@
#!/usr/bin/env python
# rawdog: RSS aggregator without delusions of grandeur.
-# Copyright 2003, 2004, 2005, 2006 Adam Sampson <ats@offog.org>
+# Copyright 2003, 2004, 2005, 2006, 2016 Adam Sampson <ats@offog.org>
#
# rawdog is free software; you can redistribute and/or modify it
# under the terms of that license as published by the Free Software
@@ -18,7 +18,9 @@
# MA 02110-1301, USA, or see http://www.gnu.org/.
from rawdoglib.rawdog import main
-import sys, os
+
+import os
+import sys
def launch():
sys.exit(main(sys.argv[1:]))
@@ -29,4 +31,3 @@ if __name__ == "__main__":
profile.run("launch()")
else:
launch()
-
diff --git a/rawdoglib/__init__.py b/rawdoglib/__init__.py
index c6744f7..382e2c5 100644
--- a/rawdoglib/__init__.py
+++ b/rawdoglib/__init__.py
@@ -1 +1,5 @@
-__all__ = ['feedparser', 'feedfinder', 'timeoutsocket', 'rawdog', 'persister', 'upgrade_1_2']
+__all__ = [
+ 'feedscanner',
+ 'persister',
+ 'rawdog',
+ ]
diff --git a/rawdoglib/feedfinder.py b/rawdoglib/feedfinder.py
deleted file mode 100644
index b4fd28e..0000000
--- a/rawdoglib/feedfinder.py
+++ /dev/null
@@ -1,366 +0,0 @@
-"""feedfinder: Find the Web feed for a Web page
-http://www.aaronsw.com/2002/feedfinder/
-
-Usage:
- feed(uri) - returns feed found for a URI
- feeds(uri) - returns all feeds found for a URI
-
- >>> import feedfinder
- >>> feedfinder.feed('scripting.com')
- 'http://scripting.com/rss.xml'
- >>>
- >>> feedfinder.feeds('scripting.com')
- ['http://delong.typepad.com/sdj/atom.xml',
- 'http://delong.typepad.com/sdj/index.rdf',
- 'http://delong.typepad.com/sdj/rss.xml']
- >>>
-
-Can also use from the command line. Feeds are returned one per line:
-
- $ python feedfinder.py diveintomark.org
- http://diveintomark.org/xml/atom.xml
-
-How it works:
- 0. At every step, feeds are minimally verified to make sure they are really feeds.
- 1. If the URI points to a feed, it is simply returned; otherwise
- the page is downloaded and the real fun begins.
- 2. Feeds pointed to by LINK tags in the header of the page (autodiscovery)
- 3. <A> links to feeds on the same server ending in ".rss", ".rdf", ".xml", or
- ".atom"
- 4. <A> links to feeds on the same server containing "rss", "rdf", "xml", or "atom"
- 5. <A> links to feeds on external servers ending in ".rss", ".rdf", ".xml", or
- ".atom"
- 6. <A> links to feeds on external servers containing "rss", "rdf", "xml", or "atom"
- 7. Try some guesses about common places for feeds (index.xml, atom.xml, etc.).
- 8. As a last ditch effort, we search Syndic8 for feeds matching the URI
-"""
-
-__version__ = "1.371"
-__date__ = "2006-04-24"
-__maintainer__ = "Aaron Swartz (me@aaronsw.com)"
-__author__ = "Mark Pilgrim (http://diveintomark.org)"
-__copyright__ = "Copyright 2002-4, Mark Pilgrim; 2006 Aaron Swartz"
-__license__ = "Python"
-__credits__ = """Abe Fettig for a patch to sort Syndic8 feeds by popularity
-Also Jason Diamond, Brian Lalor for bug reporting and patches"""
-
-_debug = 0
-
-import sgmllib, urllib, urlparse, re, sys, robotparser
-
-import threading
-class TimeoutError(Exception): pass
-def timelimit(timeout):
- """borrowed from web.py"""
- def _1(function):
- def _2(*args, **kw):
- class Dispatch(threading.Thread):
- def __init__(self):
- threading.Thread.__init__(self)
- self.result = None
- self.error = None
-
- self.setDaemon(True)
- self.start()
-
- def run(self):
- try:
- self.result = function(*args, **kw)
- except:
- self.error = sys.exc_info()
-
- c = Dispatch()
- c.join(timeout)
- if c.isAlive():
- raise TimeoutError, 'took too long'
- if c.error:
- raise c.error[0], c.error[1]
- return c.result
- return _2
- return _1
-
-# XML-RPC support allows feedfinder to query Syndic8 for possible matches.
-# Python 2.3 now comes with this module by default, otherwise you can download it
-try:
- import xmlrpclib # http://www.pythonware.com/products/xmlrpc/
-except ImportError:
- xmlrpclib = None
-
-if not dict:
- def dict(aList):
- rc = {}
- for k, v in aList:
- rc[k] = v
- return rc
-
-def _debuglog(message):
- if _debug: print message
-
-class URLGatekeeper:
- """a class to track robots.txt rules across multiple servers"""
- def __init__(self):
- self.rpcache = {} # a dictionary of RobotFileParser objects, by domain
- self.urlopener = urllib.FancyURLopener()
- self.urlopener.version = "feedfinder/" + __version__ + " " + self.urlopener.version + " +http://www.aaronsw.com/2002/feedfinder/"
- _debuglog(self.urlopener.version)
- self.urlopener.addheaders = [('User-agent', self.urlopener.version)]
- robotparser.URLopener.version = self.urlopener.version
- robotparser.URLopener.addheaders = self.urlopener.addheaders
-
- def _getrp(self, url):
- protocol, domain = urlparse.urlparse(url)[:2]
- if self.rpcache.has_key(domain):
- return self.rpcache[domain]
- baseurl = '%s://%s' % (protocol, domain)
- robotsurl = urlparse.urljoin(baseurl, 'robots.txt')
- _debuglog('fetching %s' % robotsurl)
- rp = robotparser.RobotFileParser(robotsurl)
- try:
- rp.read()
- except:
- pass
- self.rpcache[domain] = rp
- return rp
-
- def can_fetch(self, url):
- rp = self._getrp(url)
- allow = rp.can_fetch(self.urlopener.version, url)
- _debuglog("gatekeeper of %s says %s" % (url, allow))
- return allow
-
- @timelimit(10)
- def get(self, url, check=True):
- if check and not self.can_fetch(url): return ''
- try:
- return self.urlopener.open(url).read()
- except:
- return ''
-
-_gatekeeper = URLGatekeeper()
-
-class BaseParser(sgmllib.SGMLParser):
- def __init__(self, baseuri):
- sgmllib.SGMLParser.__init__(self)
- self.links = []
- self.baseuri = baseuri
-
- def normalize_attrs(self, attrs):
- def cleanattr(v):
- v = sgmllib.charref.sub(lambda m: unichr(int(m.groups()[0])), v)
- v = v.strip()
- v = v.replace('&lt;', '<').replace('&gt;', '>').replace('&apos;', "'").replace('&quot;', '"').replace('&amp;', '&')
- return v
- attrs = [(k.lower(), cleanattr(v)) for k, v in attrs]
- attrs = [(k, k in ('rel','type') and v.lower() or v) for k, v in attrs]
- return attrs
-
- def do_base(self, attrs):
- attrsD = dict(self.normalize_attrs(attrs))
- if not attrsD.has_key('href'): return
- self.baseuri = attrsD['href']
-
- def error(self, *a, **kw): pass # we're not picky
-
-class LinkParser(BaseParser):
- FEED_TYPES = ('application/rss+xml',
- 'text/xml',
- 'application/atom+xml',
- 'application/x.atom+xml',
- 'application/x-atom+xml')
- def do_link(self, attrs):
- attrsD = dict(self.normalize_attrs(attrs))
- if not attrsD.has_key('rel'): return
- rels = attrsD['rel'].split()
- if 'alternate' not in rels: return
- if attrsD.get('type') not in self.FEED_TYPES: return
- if not attrsD.has_key('href'): return
- self.links.append(urlparse.urljoin(self.baseuri, attrsD['href']))
-
-class ALinkParser(BaseParser):
- def start_a(self, attrs):
- attrsD = dict(self.normalize_attrs(attrs))
- if not attrsD.has_key('href'): return
- self.links.append(urlparse.urljoin(self.baseuri, attrsD['href']))
-
-def makeFullURI(uri):
- uri = uri.strip()
- if uri.startswith('feed://'):
- uri = 'http://' + uri.split('feed://', 1).pop()
- for x in ['http', 'https']:
- if uri.startswith('%s://' % x):
- return uri
- return 'http://%s' % uri
-
-def getLinks(data, baseuri):
- p = LinkParser(baseuri)
- p.feed(data)
- return p.links
-
-def getALinks(data, baseuri):
- p = ALinkParser(baseuri)
- p.feed(data)
- return p.links
-
-def getLocalLinks(links, baseuri):
- baseuri = baseuri.lower()
- urilen = len(baseuri)
- return [l for l in links if l.lower().startswith(baseuri)]
-
-def isFeedLink(link):
- return link[link.rfind('.'):].lower() in ('.rss', '.rdf', '.xml', '.atom')
-
-def isXMLRelatedLink(link):
- link = link.lower()
- return link.count('rss') + link.count('rdf') + link.count('xml') + link.count('atom')
-
-r_brokenRedirect = re.compile('<newLocation[^>]*>(.*?)</newLocation>', re.S)
-def tryBrokenRedirect(data):
- if '<newLocation' in data:
- newuris = r_brokenRedirect.findall(data)
- if newuris: return newuris[0].strip()
-
-def couldBeFeedData(data):
- data = data.lower()
- if data.count('<html'): return 0
- return data.count('<rss') + data.count('<rdf') + data.count('<feed')
-
-def isFeed(uri):
- _debuglog('seeing if %s is a feed' % uri)
- protocol = urlparse.urlparse(uri)
- if protocol[0] not in ('http', 'https'): return 0
- data = _gatekeeper.get(uri)
- return couldBeFeedData(data)
-
-def sortFeeds(feed1Info, feed2Info):
- return cmp(feed2Info['headlines_rank'], feed1Info['headlines_rank'])
-
-def getFeedsFromSyndic8(uri):
- feeds = []
- try:
- server = xmlrpclib.Server('http://www.syndic8.com/xmlrpc.php')
- feedids = server.syndic8.FindFeeds(uri)
- infolist = server.syndic8.GetFeedInfo(feedids, ['headlines_rank','status','dataurl'])
- infolist.sort(sortFeeds)
- feeds = [f['dataurl'] for f in infolist if f['status']=='Syndicated']
- _debuglog('found %s feeds through Syndic8' % len(feeds))
- except:
- pass
- return feeds
-
-def feeds(uri, all=False, querySyndic8=False, _recurs=None):
- if _recurs is None: _recurs = [uri]
- fulluri = makeFullURI(uri)
- try:
- data = _gatekeeper.get(fulluri, check=False)
- except:
- return []
- # is this already a feed?
- if couldBeFeedData(data):
- return [fulluri]
- newuri = tryBrokenRedirect(data)
- if newuri and newuri not in _recurs:
- _recurs.append(newuri)
- return feeds(newuri, all=all, querySyndic8=querySyndic8, _recurs=_recurs)
- # nope, it's a page, try LINK tags first
- _debuglog('looking for LINK tags')
- try:
- outfeeds = getLinks(data, fulluri)
- except:
- outfeeds = []
- _debuglog('found %s feeds through LINK tags' % len(outfeeds))
- outfeeds = filter(isFeed, outfeeds)
- if all or not outfeeds:
- # no LINK tags, look for regular <A> links that point to feeds
- _debuglog('no LINK tags, looking at A tags')
- try:
- links = getALinks(data, fulluri)
- except:
- links = []
- locallinks = getLocalLinks(links, fulluri)
- # look for obvious feed links on the same server
- outfeeds.extend(filter(isFeed, filter(isFeedLink, locallinks)))
- if all or not outfeeds:
- # look harder for feed links on the same server
- outfeeds.extend(filter(isFeed, filter(isXMLRelatedLink, locallinks)))
- if all or not outfeeds:
- # look for obvious feed links on another server
- outfeeds.extend(filter(isFeed, filter(isFeedLink, links)))
- if all or not outfeeds:
- # look harder for feed links on another server
- outfeeds.extend(filter(isFeed, filter(isXMLRelatedLink, links)))
- if all or not outfeeds:
- _debuglog('no A tags, guessing')
- suffixes = [ # filenames used by popular software:
- 'atom.xml', # blogger, TypePad
- 'index.atom', # MT, apparently
- 'index.rdf', # MT
- 'rss.xml', # Dave Winer/Manila
- 'index.xml', # MT
- 'index.rss' # Slash
- ]
- outfeeds.extend(filter(isFeed, [urlparse.urljoin(fulluri, x) for x in suffixes]))
- if (all or not outfeeds) and querySyndic8:
- # still no luck, search Syndic8 for feeds (requires xmlrpclib)
- _debuglog('still no luck, searching Syndic8')
- outfeeds.extend(getFeedsFromSyndic8(uri))
- if hasattr(__builtins__, 'set') or __builtins__.has_key('set'):
- outfeeds = list(set(outfeeds))
- return outfeeds
-
-getFeeds = feeds # backwards-compatibility
-
-def feed(uri):
- #todo: give preference to certain feed formats
- feedlist = feeds(uri)
- if feedlist:
- return feedlist[0]
- else:
- return None
-
-##### test harness ######
-
-def test():
- uri = 'http://diveintomark.org/tests/client/autodiscovery/html4-001.html'
- failed = []
- count = 0
- while 1:
- data = _gatekeeper.get(uri)
- if data.find('Atom autodiscovery test') == -1: break
- sys.stdout.write('.')
- sys.stdout.flush()
- count += 1
- links = getLinks(data, uri)
- if not links:
- print '\n*** FAILED ***', uri, 'could not find link'
- failed.append(uri)
- elif len(links) > 1:
- print '\n*** FAILED ***', uri, 'found too many links'
- failed.append(uri)
- else:
- atomdata = urllib.urlopen(links[0]).read()
- if atomdata.find('<link rel="alternate"') == -1:
- print '\n*** FAILED ***', uri, 'retrieved something that is not a feed'
- failed.append(uri)
- else:
- backlink = atomdata.split('href="').pop().split('"')[0]
- if backlink != uri:
- print '\n*** FAILED ***', uri, 'retrieved wrong feed'
- failed.append(uri)
- if data.find('<link rel="next" href="') == -1: break
- uri = urlparse.urljoin(uri, data.split('<link rel="next" href="').pop().split('"')[0])
- print
- print count, 'tests executed,', len(failed), 'failed'
-
-if __name__ == '__main__':
- args = sys.argv[1:]
- if args and args[0] == '--debug':
- _debug = 1
- args.pop(0)
- if args:
- uri = args[0]
- else:
- uri = 'http://diveintomark.org/'
- if uri == 'test':
- test()
- else:
- print "\n".join(getFeeds(uri))
diff --git a/rawdoglib/feedscanner.py b/rawdoglib/feedscanner.py
new file mode 100644
index 0000000..e0034b2
--- /dev/null
+++ b/rawdoglib/feedscanner.py
@@ -0,0 +1,137 @@
+"""Scan a URL's contents to find related feeds
+
+This is a compatible replacement for Aaron Swartz's feedfinder module,
+using feedparser to check whether the URLs it returns are feeds.
+
+It finds links to feeds within the following elements:
+- <link rel="alternate" ...> (standard feed discovery)
+- <a ...>, if the href contains words that suggest it might be a feed
+
+It orders feeds using a quality heuristic: the first result is the most
+likely to be a feed for the given URL.
+
+Required: Python 2.4 or later, feedparser
+"""
+
+__license__ = """
+Copyright (c) 2008 Decklin Foster <decklin@red-bean.com>
+Copyright (c) 2013, 2015 Adam Sampson <ats@offog.org>
+
+Permission to use, copy, modify, and/or distribute this software for
+any purpose with or without fee is hereby granted, provided that
+the above copyright notice and this permission notice appear in all
+copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
+WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
+AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
+DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA
+OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+PERFORMANCE OF THIS SOFTWARE.
+"""
+
+import cStringIO
+import feedparser
+import gzip
+import re
+import urllib2
+import urlparse
+import HTMLParser
+
+def is_feed(url):
+ """Return true if feedparser can understand the given URL as a feed."""
+
+ p = feedparser.parse(url)
+ version = p.get("version")
+ if version is None:
+ version = ""
+ return version != ""
+
+def fetch_url(url):
+ """Fetch the given URL and return the data from it as a Unicode string."""
+
+ request = urllib2.Request(url)
+ request.add_header("Accept-Encoding", "gzip")
+
+ f = urllib2.urlopen(request)
+ headers = f.info()
+ data = f.read()
+ f.close()
+
+ # We have to support gzip encoding because some servers will use it
+ # even if you explicitly refuse it in Accept-Encoding.
+ encodings = headers.get("Content-Encoding", "")
+ encodings = [s.strip() for s in encodings.split(",")]
+ if "gzip" in encodings:
+ f = gzip.GzipFile(fileobj=cStringIO.StringIO(data))
+ data = f.read()
+ f.close()
+
+ # Silently ignore encoding errors -- we don't need to go to the bother of
+ # detecting the encoding properly (like feedparser does).
+ data = data.decode("UTF-8", "ignore")
+
+ return data
+
+class FeedFinder(HTMLParser.HTMLParser):
+ def __init__(self, base_uri):
+ HTMLParser.HTMLParser.__init__(self)
+ self.found = []
+ self.count = 0
+ self.base_uri = base_uri
+
+ def add(self, score, href):
+ url = urlparse.urljoin(self.base_uri, href)
+ lower = url.lower()
+
+ # Some sites provide feeds both for entries and comments;
+ # prefer the former.
+ if lower.find("comment") != -1:
+ score -= 50
+
+ # Prefer Atom, then RSS, then RDF (RSS 1).
+ if lower.find("atom") != -1:
+ score += 10
+ elif lower.find("rss2") != -1:
+ score -= 5
+ elif lower.find("rss") != -1:
+ score -= 10
+ elif lower.find("rdf") != -1:
+ score -= 15
+
+ self.found.append((-score, self.count, url))
+ self.count += 1
+
+ def urls(self):
+ return [link[2] for link in sorted(self.found)]
+
+ def handle_starttag(self, tag, attrs):
+ attrs = dict(attrs)
+ href = attrs.get('href')
+ if href is None:
+ return
+ if tag == 'link' and attrs.get('rel') == 'alternate' and \
+ not attrs.get('type') == 'text/html':
+ self.add(200, href)
+ if tag == 'a' and re.search(r'\b(rss|atom|rdf|feeds?)\b', href, re.I):
+ self.add(100, href)
+
+def feeds(page_url):
+ """Search the given URL for possible feeds, returning a list of them."""
+
+ # If the URL is a feed, there's no need to scan it for links.
+ if is_feed(page_url):
+ return [page_url]
+
+ data = fetch_url(page_url)
+ parser = FeedFinder(page_url)
+ try:
+ parser.feed(data)
+ except HTMLParser.HTMLParseError:
+ pass
+ found = parser.urls()
+
+ # Return only feeds that feedparser can understand.
+ return [feed for feed in found if is_feed(feed)]
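
The public surface of the new module is just feeds() and is_feed(), which is all that rawdog.py's add_feed() further down needs. A minimal usage sketch (Python 2; the URL is a placeholder):

    from rawdoglib import feedscanner

    # feeds() returns candidate feed URLs, best match first, keeping only
    # URLs that feedparser actually recognises as feeds.
    candidates = feedscanner.feeds("http://example.org/")
    if candidates:
        print "Best guess:", candidates[0]
    else:
        print "No feeds found for that page"
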
diff --git a/rawdoglib/persister.py b/rawdoglib/persister.py
index 6c06e2c..40169da 100644
--- a/rawdoglib/persister.py
+++ b/rawdoglib/persister.py
@@ -1,87 +1,192 @@
-# persister: safe class persistance wrapper
-# Copyright 2003, 2004, 2005 Adam Sampson <ats@offog.org>
+# persister: persist Python objects safely to pickle files
+# Copyright 2003, 2004, 2005, 2013, 2014 Adam Sampson <ats@offog.org>
#
-# persister is free software; you can redistribute it and/or modify it
-# under the terms of the GNU Lesser General Public License as
-# published by the Free Software Foundation; either version 2.1 of the
-# License, or (at your option) any later version.
+# rawdog is free software; you can redistribute and/or modify it
+# under the terms of that license as published by the Free Software
+# Foundation; either version 2 of the License, or (at your option)
+# any later version.
#
-# persister is distributed in the hope that it will be useful, but
+# rawdog is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-# Lesser General Public License for more details.
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
#
-# You should have received a copy of the GNU Lesser General Public
-# License along with persister; see the file COPYING.LGPL. If not,
-# write to the Free Software Foundation, Inc., 51 Franklin Street,
-# Fifth Floor, Boston, MA 02110-1301, USA, or see http://www.gnu.org/.
+# You should have received a copy of the GNU General Public License
+# along with rawdog; see the file COPYING. If not, write to the Free
+# Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+# MA 02110-1301, USA, or see http://www.gnu.org/.
-import fcntl, os, errno
import cPickle as pickle
+import errno
+import fcntl
+import os
+import sys
class Persistable:
- """Something which can be persisted. When a subclass of this wants to
- indicate that it has been modified, it should call
- self.modified()."""
- def __init__(self): self._modified = False
- def modified(self, state = True): self._modified = state
- def is_modified(self): return self._modified
+ """An object which can be persisted."""
-class Persister:
- """Persist another class to a file, safely. The class being persisted
- must derive from Persistable (although this isn't enforced)."""
+ def __init__(self):
+ self._modified = False
- def __init__(self, filename, klass, use_locking = True):
- self.filename = filename
+ def modified(self, state=True):
+ """Mark the object as having been modified (or not)."""
+ self._modified = state
+
+ def is_modified(self):
+ return self._modified
+
+class Persisted:
+ """Context manager for a persistent object. The object being persisted
+ must implement the Persistable interface."""
+
+ def __init__(self, klass, filename, persister):
self.klass = klass
- self.use_locking = use_locking
- self.file = None
+ self.filename = filename
+ self.persister = persister
+ self.lock_file = None
self.object = None
+ self.refcount = 0
+
+ def rename(self, new_filename):
+ """Rename the persisted file. This works whether the file is
+ currently open or not."""
+
+ self.persister._rename(self.filename, new_filename)
+ for ext in ("", ".lock"):
+ try:
+ os.rename(self.filename + ext,
+ new_filename + ext)
+ except OSError, e:
+ # If the file doesn't exist (yet),
+ # that's OK.
+ if e.errno != errno.ENOENT:
+ raise e
+ self.filename = new_filename
+
+ def __enter__(self):
+ """As open()."""
+ return self.open()
- def load(self, no_block = True):
- """Load the persisted object from the file, or create a new one
- if this isn't possible. Returns the loaded object."""
+ def __exit__(self, type, value, tb):
+ """As close(), unless an exception occurred in which case do
+ nothing."""
+ if tb is None:
+ self.close()
- def get_lock():
- if not self.use_locking:
- return True
+ def open(self, no_block=True):
+ """Return the persistent object, loading it from its file if it
+ isn't already open. You must call close() once you're finished
+ with the object.
+
+ If no_block is True, then this will return None if loading the
+ object would otherwise block (i.e. if it's locked by another
+ process)."""
+
+ if self.refcount > 0:
+ # Already loaded.
+ self.refcount += 1
+ return self.object
+
+ try:
+ self._open(no_block)
+ except KeyboardInterrupt:
+ sys.exit(1)
+ except:
+ print "An error occurred while reading state from " + os.path.abspath(self.filename) + "."
+ print "This usually means the file is corrupt, and removing it will fix the problem."
+ sys.exit(1)
+
+ self.refcount = 1
+ return self.object
+
+ def _get_lock(self, no_block):
+ if not self.persister.use_locking:
+ return True
+
+ self.lock_file = open(self.filename + ".lock", "w+")
+ try:
mode = fcntl.LOCK_EX
if no_block:
mode |= fcntl.LOCK_NB
- try:
- fcntl.lockf(self.file.fileno(), mode)
- except IOError, e:
- if no_block and e.errno in (errno.EACCES, errno.EAGAIN):
- return False
- raise e
- return True
+ fcntl.lockf(self.lock_file.fileno(), mode)
+ except IOError, e:
+ if no_block and e.errno in (errno.EACCES, errno.EAGAIN):
+ return False
+ raise e
+ return True
+
+ def _open(self, no_block):
+ self.persister.log("Loading state file: ", self.filename)
+
+ if not self._get_lock(no_block):
+ return None
try:
- self.file = open(self.filename, "r+")
- if not get_lock():
- return None
- self.object = pickle.load(self.file)
- self.object.modified(False)
+ f = open(self.filename, "rb")
except IOError:
- self.file = open(self.filename, "w+")
- if not get_lock():
- return None
+ # File can't be opened.
+ # Create a new object.
self.object = self.klass()
self.object.modified()
- return self.object
+ return
+
+ self.object = pickle.load(f)
+ self.object.modified(False)
+ f.close()
+
+ def close(self):
+ """Reduce the reference count of the persisted object, saving
+ it back to its file if necessary."""
+
+ self.refcount -= 1
+ if self.refcount > 0:
+ # Still in use.
+ return
- def save(self):
- """Save the persisted object back to the file if necessary."""
if self.object.is_modified():
+ self.persister.log("Saving state file: ", self.filename)
newname = "%s.new-%d" % (self.filename, os.getpid())
newfile = open(newname, "w")
- try:
- pickle.dump(self.object, newfile, pickle.HIGHEST_PROTOCOL)
- except AttributeError:
- # Python 2.2 doesn't have the protocol
- # argument.
- pickle.dump(self.object, newfile, True)
+ pickle.dump(self.object, newfile, pickle.HIGHEST_PROTOCOL)
newfile.close()
os.rename(newname, self.filename)
- self.file.close()
+ if self.lock_file is not None:
+ self.lock_file.close()
+ self.persister._remove(self.filename)
+
+class Persister:
+ """Manage the collection of persisted files."""
+
+ def __init__(self, config):
+ self.files = {}
+ self.log = config.log
+ self.use_locking = config.locking
+
+ def get(self, klass, filename):
+ """Get a context manager for a persisted file.
+ If the file is already open, this will return
+ the existing context manager."""
+
+ if filename in self.files:
+ return self.files[filename]
+
+ p = Persisted(klass, filename, self)
+ self.files[filename] = p
+ return p
+
+ def _rename(self, old_filename, new_filename):
+ self.files[new_filename] = self.files[old_filename]
+ del self.files[old_filename]
+
+ def _remove(self, filename):
+ del self.files[filename]
+
+ def delete(self, filename):
+ """Delete a persisted file, along with its lock file,
+ if they exist."""
+ for ext in ("", ".lock"):
+ try:
+ os.unlink(filename + ext)
+ except OSError:
+ pass
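
The reworked persister turns Persisted into a reference-counted context manager: Persister.get() hands out one shared Persisted per filename, and leaving the with block (or calling close()) rewrites the pickle only if the object reported modified(). A rough usage sketch, with a stub standing in for rawdog's Config (only a log callable and a locking flag are needed):

    from rawdoglib.persister import Persistable, Persister

    class Counter(Persistable):
        def __init__(self):
            Persistable.__init__(self)
            self.value = 0

    class StubConfig:
        # Illustrative stand-in for rawdog's Config object.
        locking = True
        def log(self, *args):
            pass

    persister = Persister(StubConfig())
    with persister.get(Counter, "counter.state") as counter:
        counter.value += 1
        counter.modified()   # mark dirty so the file is rewritten on close
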
diff --git a/rawdoglib/plugins.py b/rawdoglib/plugins.py
index 9dcb2e7..447b269 100644
--- a/rawdoglib/plugins.py
+++ b/rawdoglib/plugins.py
@@ -1,5 +1,5 @@
# plugins: handle add-on modules for rawdog.
-# Copyright 2004, 2005 Adam Sampson <ats@offog.org>
+# Copyright 2004, 2005, 2013, 2016 Adam Sampson <ats@offog.org>
#
# rawdog is free software; you can redistribute and/or modify it
# under the terms of that license as published by the Free Software
@@ -16,12 +16,17 @@
# Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# MA 02110-1301, USA, or see http://www.gnu.org/.
-import os, imp
+# The design of rawdog's plugin API was inspired by Stuart Langridge's
+# Vellum weblog system:
+# http://www.kryogenix.org/code/vellum/
+
+import imp
+import os
class Box:
"""Utility class that holds a mutable value. Useful for passing
immutable types by reference."""
- def __init__(self, value = None):
+ def __init__(self, value=None):
self.value = value
plugin_count = 0
@@ -49,7 +54,7 @@ def load_plugins(dir, config):
fn = os.path.join(dir, file)
config.log("Loading plugin ", fn)
f = open(fn, "r")
- mod = imp.load_module("plugin%d" % (plugin_count,), f, fn, desc)
+ imp.load_module("plugin%d" % (plugin_count,), f, fn, desc)
plugin_count += 1
f.close()
@@ -70,4 +75,3 @@ def call_hook(hookname, *args):
if not func(*args):
return True
return False
-
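
call_hook() above returns True as soon as any registered hook function returns a false value, which is how a plugin claims an event (see the config_option calls in Config.load() below). A sketch of a plugin file that load_plugins() would pick up; it assumes rawdoglib.plugins.attach_hook(), which is part of the plugin module but not visible in this hunk:

    import rawdoglib.plugins

    def my_config_option(config, name, value):
        if name == "myoption":
            config["myoption"] = value
            return False   # claim the option; call_hook() reports it handled
        return True        # not ours; let other plugins and rawdog see it

    rawdoglib.plugins.attach_hook("config_option", my_config_option)
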
diff --git a/rawdoglib/rawdog.py b/rawdoglib/rawdog.py
index afb90e6..06139dd 100644
--- a/rawdoglib/rawdog.py
+++ b/rawdoglib/rawdog.py
@@ -1,5 +1,5 @@
# rawdog: RSS aggregator without delusions of grandeur.
-# Copyright 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Adam Sampson <ats@offog.org>
+# Copyright 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2012, 2013, 2014, 2015, 2016 Adam Sampson <ats@offog.org>
#
# rawdog is free software; you can redistribute and/or modify it
# under the terms of that license as published by the Free Software
@@ -16,26 +16,32 @@
# Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# MA 02110-1301, USA, or see http://www.gnu.org/.
-VERSION = "2.13"
+VERSION = "2.22"
+HTTP_AGENT = "rawdog/" + VERSION
STATE_VERSION = 2
-import feedparser, plugins
-from persister import Persistable, Persister
-import os, time, getopt, sys, re, cgi, socket, urllib2, calendar
-import string, locale
-from StringIO import StringIO
-import types
-
-try:
- import threading
- have_threading = 1
-except:
- have_threading = 0
-try:
- import hashlib
-except:
- hashlib = None
- import sha
+import rawdoglib.feedscanner
+from rawdoglib.persister import Persistable, Persister
+from rawdoglib.plugins import Box, call_hook, load_plugins
+
+from cStringIO import StringIO
+import base64
+import calendar
+import cgi
+import feedparser
+import getopt
+import hashlib
+import locale
+import os
+import re
+import socket
+import string
+import sys
+import threading
+import time
+import types
+import urllib2
+import urlparse
try:
import tidylib
@@ -47,26 +53,18 @@ try:
except:
mxtidy = None
-try:
- import feedfinder
-except:
- feedfinder = None
+# Turn off content-cleaning, since we want to see an approximation to the
+# original content for hashing. rawdog will sanitise HTML when writing.
+feedparser.RESOLVE_RELATIVE_URIS = 0
+feedparser.SANITIZE_HTML = 0
-def new_sha1(s = ""):
- """Return a new SHA1 hash object."""
- if hashlib is None:
- return sha.new(s)
- else:
- return hashlib.sha1(s)
+# Disable microformat support, because it tends to return poor-quality data
+# (e.g. identifying inappropriate things as enclosures), and it relies on
+# BeautifulSoup which is unable to parse many feeds.
+feedparser.PARSE_MICROFORMATS = 0
-def set_socket_timeout(n):
- """Set the system socket timeout."""
- if hasattr(socket, "setdefaulttimeout"):
- socket.setdefaulttimeout(n)
- else:
- # Python 2.2 and earlier need to use an external module.
- import timeoutsocket
- timeoutsocket.setDefaultSocketTimeout(n)
+# This is initialised in main().
+persister = None
system_encoding = None
def get_system_encoding():
@@ -76,12 +74,18 @@ def get_system_encoding():
def safe_ftime(format, t):
"""Format a time value into a string in the current locale (as
time.strftime), but encode the result as ASCII HTML."""
- u = unicode(time.strftime(format, t), get_system_encoding())
+ try:
+ u = unicode(time.strftime(format, t), get_system_encoding())
+ except ValueError, e:
+ u = u"(bad time %s; %s)" % (repr(t), str(e))
return encode_references(u)
def format_time(secs, config):
"""Format a time and date nicely."""
- t = time.localtime(secs)
+ try:
+ t = time.localtime(secs)
+ except ValueError, e:
+ return u"(bad time %s; %s)" % (repr(secs), str(e))
format = config["datetimeformat"]
if format is None:
format = config["timeformat"] + ", " + config["dayformat"]
@@ -130,13 +134,17 @@ def sanitise_html(html, baseurl, inline, config):
html = "<p>" + html
if config["tidyhtml"]:
- args = {"numeric_entities": 1,
- "output_html": 1,
- "output_xhtml": 0,
- "output_xml": 0,
- "wrap": 0}
- plugins.call_hook("mxtidy_args", config, args, baseurl, inline)
- plugins.call_hook("tidy_args", config, args, baseurl, inline)
+ args = {
+ "numeric_entities": 1,
+ "input_encoding": "ascii",
+ "output_encoding": "ascii",
+ "output_html": 1,
+ "output_xhtml": 0,
+ "output_xml": 0,
+ "wrap": 0,
+ }
+ call_hook("mxtidy_args", config, args, baseurl, inline)
+ call_hook("tidy_args", config, args, baseurl, inline)
if tidylib is not None:
# Disable PyTidyLib's somewhat unhelpful defaults.
tidylib.BASE_OPTIONS = {}
@@ -150,16 +158,18 @@ def sanitise_html(html, baseurl, inline, config):
: output.rfind("</body>")].strip()
html = html.decode("UTF-8")
- box = plugins.Box(html)
- plugins.call_hook("clean_html", config, box, baseurl, inline)
+ box = Box(html)
+ call_hook("clean_html", config, box, baseurl, inline)
return box.value
def select_detail(details):
"""Pick the preferred type of detail from a list of details. (If the
argument isn't a list, treat it as a list of one.)"""
- types = {"text/html": 30,
- "application/xhtml+xml": 20,
- "text/plain": 10}
+ TYPES = {
+ "text/html": 30,
+ "application/xhtml+xml": 20,
+ "text/plain": 10,
+ }
if details is None:
return None
@@ -171,8 +181,8 @@ def select_detail(details):
ctype = detail.get("type", None)
if ctype is None:
continue
- if types.has_key(ctype):
- score = types[ctype]
+ if TYPES.has_key(ctype):
+ score = TYPES[ctype]
else:
score = 0
if detail["value"] != "":
@@ -184,7 +194,7 @@ def select_detail(details):
else:
return ds[-1][1]
-def detail_to_html(details, inline, config, force_preformatted = False):
+def detail_to_html(details, inline, config, force_preformatted=False):
"""Convert a detail hash or list of detail hashes as returned by
feedparser into HTML."""
detail = select_detail(details)
@@ -212,14 +222,14 @@ def author_to_html(entry, feedurl, config):
url = None
fallback = "author"
if author_detail is not None:
- if author_detail.has_key("url"):
- url = author_detail["url"]
+ if author_detail.has_key("href"):
+ url = author_detail["href"]
elif author_detail.has_key("email") and author_detail["email"] is not None:
url = "mailto:" + author_detail["email"]
if author_detail.has_key("email") and author_detail["email"] is not None:
fallback = author_detail["email"]
- elif author_detail.has_key("url") and author_detail["url"] is not None:
- fallback = author_detail["url"]
+ elif author_detail.has_key("href") and author_detail["href"] is not None:
+ fallback = author_detail["href"]
if name == "":
name = fallback
@@ -242,8 +252,8 @@ def fill_template(template, bits):
including sections bracketed by __if_x__ .. [__else__ ..]
__endif__ if bits["x"] is not "". If not bits.has_key("x"),
__x__ expands to ""."""
- result = plugins.Box()
- plugins.call_hook("fill_template", template, bits, result)
+ result = Box()
+ call_hook("fill_template", template, bits, result)
if result.value is not None:
return result.value
@@ -279,12 +289,24 @@ def fill_template(template, bits):
file_cache = {}
def load_file(name):
- """Read the contents of a file, caching the result so we don't have to
- read the file multiple times."""
+ """Read the contents of a template file, caching the result so we don't
+ have to read the file multiple times. The file is assumed to be in the
+ system encoding; the result will be an ASCII string."""
if not file_cache.has_key(name):
- f = open(name)
- file_cache[name] = f.read()
- f.close()
+ try:
+ f = open(name)
+ data = f.read()
+ f.close()
+ except IOError:
+ raise ConfigError("Can't read template file: " + name)
+
+ try:
+ data = data.decode(get_system_encoding())
+ except UnicodeDecodeError, e:
+ raise ConfigError("Character encoding problem in template file: " + name + ": " + str(e))
+
+ data = encode_references(data)
+ file_cache[name] = data.encode(get_system_encoding())
return file_cache[name]
def write_ascii(f, s, config):
@@ -299,7 +321,7 @@ def write_ascii(f, s, config):
def short_hash(s):
"""Return a human-manipulatable 'short hash' of a string."""
- return new_sha1(s).hexdigest()[-8:]
+ return hashlib.sha1(s).hexdigest()[-8:]
def ensure_unicode(value, encoding):
"""Convert a structure returned by feedparser into an equivalent where
@@ -327,6 +349,81 @@ def ensure_unicode(value, encoding):
else:
return value
+timeout_re = re.compile(r'timed? ?out', re.I)
+def is_timeout_exception(exc):
+ """Return True if the given exception object suggests that a timeout
+ occurred, else return False."""
+
+ # Since urlopen throws away the original exception object,
+ # we have to look at the stringified form to tell if it was a timeout.
+ # (We're in reasonable company here, since test_ssl.py in the Python
+ # distribution does the same thing!)
+ #
+ # The message we're looking for is something like:
+ # Stock Python 2.7.7 and 2.7.8:
+ # <urlopen error _ssl.c:495: The handshake operation timed out>
+ # Debian python 2.7.3-4+deb7u1:
+ # <urlopen error _ssl.c:489: The handshake operation timed out>
+ # Debian python 2.7.8-1:
+ # <urlopen error ('_ssl.c:563: The handshake operation timed out',)>
+ return timeout_re.search(str(exc)) is not None
+
+class BasicAuthProcessor(urllib2.BaseHandler):
+ """urllib2 handler that does HTTP basic authentication
+ or proxy authentication with a fixed username and password.
+ (Unlike the classes to do this in urllib2, this doesn't wait
+ for a 401/407 response first.)"""
+
+ def __init__(self, user, password, proxy=False):
+ self.auth = base64.b64encode(user + ":" + password)
+ if proxy:
+ self.header = "Proxy-Authorization"
+ else:
+ self.header = "Authorization"
+
+ def http_request(self, req):
+ req.add_header(self.header, "Basic " + self.auth)
+ return req
+
+ https_request = http_request
+
+class DisableIMProcessor(urllib2.BaseHandler):
+ """urllib2 handler that disables RFC 3229 for a request."""
+
+ def http_request(self, req):
+ # Request doesn't provide a method for removing headers --
+ # so overwrite the header instead.
+ req.add_header("A-IM", "identity")
+ return req
+
+ https_request = http_request
+
+class ResponseLogProcessor(urllib2.BaseHandler):
+ """urllib2 handler that maintains a log of HTTP responses."""
+
+ # Run after anything that's mangling headers (usually 500 or less), but
+ # before HTTPErrorProcessor (1000).
+ handler_order = 900
+
+ def __init__(self):
+ self.log = []
+
+ def http_response(self, req, response):
+ entry = {
+ "url": req.get_full_url(),
+ "status": response.getcode(),
+ }
+ location = response.info().get("Location")
+ if location is not None:
+ entry["location"] = location
+ self.log.append(entry)
+ return response
+
+ https_response = http_response
+
+ def get_log(self):
+ return self.log
+
non_alphanumeric_re = re.compile(r'<[^>]*>|\&[^\;]*\;|[^a-z0-9]')
class Feed:
"""An RSS feed."""
@@ -341,12 +438,9 @@ class Feed:
self.feed_info = {}
def needs_update(self, now):
- """Return 1 if it's time to update this feed, or 0 if its
- update period has not yet elapsed."""
- if (now - self.last_update) < self.period:
- return 0
- else:
- return 1
+ """Return True if it's time to update this feed, or False if
+ its update period has not yet elapsed."""
+ return (now - self.last_update) >= self.period
def get_state_filename(self):
return "feeds/%s.state" % (short_hash(self.url),)
@@ -356,100 +450,186 @@ class Feed:
handlers = []
+ logger = ResponseLogProcessor()
+ handlers.append(logger)
+
proxies = {}
- for key, arg in self.args.items():
- if key.endswith("_proxy"):
- proxies[key[:-6]] = arg
+ for name, value in self.args.items():
+ if name.endswith("_proxy"):
+ proxies[name[:-6]] = value
if len(proxies) != 0:
handlers.append(urllib2.ProxyHandler(proxies))
if self.args.has_key("proxyuser") and self.args.has_key("proxypassword"):
- mgr = DummyPasswordMgr((self.args["proxyuser"], self.args["proxypassword"]))
- handlers.append(urllib2.ProxyBasicAuthHandler(mgr))
-
- plugins.call_hook("add_urllib2_handlers", rawdog, config, self, handlers)
+ handlers.append(BasicAuthProcessor(self.args["proxyuser"], self.args["proxypassword"], proxy=True))
- auth_creds = None
if self.args.has_key("user") and self.args.has_key("password"):
- auth_creds = (self.args["user"], self.args["password"])
+ handlers.append(BasicAuthProcessor(self.args["user"], self.args["password"]))
- use_im = True
if self.get_keepmin(config) == 0 or config["currentonly"]:
- use_im = False
+ # If RFC 3229 and "A-IM: feed" is used, then there's
+ # no way to tell when an article has been removed.
+ # So if we only want to keep articles that are still
+ # being published by the feed, we have to turn it off.
+ handlers.append(DisableIMProcessor())
+
+ call_hook("add_urllib2_handlers", rawdog, config, self, handlers)
+
+ url = self.url
+ # Turn plain filenames into file: URLs. (feedparser will open
+ # plain filenames itself, but we want it to open the file with
+ # urllib2 so we get a URLError if something goes wrong.)
+ if not ":" in url:
+ url = "file:" + url
try:
- return feedparser.parse(self.url,
- etag = self.etag,
- modified = self.modified,
- agent = "rawdog/" + VERSION,
- handlers = handlers,
- auth_creds = auth_creds,
- use_im = use_im)
+ result = feedparser.parse(url,
+ etag=self.etag,
+ modified=self.modified,
+ agent=HTTP_AGENT,
+ handlers=handlers)
except Exception, e:
- return {
+ result = {
"rawdog_exception": e,
"rawdog_traceback": sys.exc_info()[2],
}
+ result["rawdog_responses"] = logger.get_log()
+ return result
def update(self, rawdog, now, config, articles, p):
"""Add new articles from a feed to the collection.
Returns True if any articles were read, False otherwise."""
- status = p.get("status")
+ # Note that feedparser might have thrown an exception --
+ # so until we print the error message and return, we
+ # can't assume that p contains any particular field.
+
+ responses = p.get("rawdog_responses")
+ if len(responses) > 0:
+ last_status = responses[-1]["status"]
+ elif len(p.get("feed", [])) != 0:
+ # Some protocol other than HTTP -- assume it's OK,
+ # since we got some content.
+ last_status = 200
+ else:
+ # Timeout, or empty response from non-HTTP.
+ last_status = 0
+
+ version = p.get("version")
+ if version is None:
+ version = ""
+
self.last_update = now
- error = None
- non_fatal = False
+ errors = []
+ fatal = False
old_url = self.url
+
if "rawdog_exception" in p:
- error = "Error fetching or parsing feed: %s" % str(p["rawdog_exception"])
+ errors.append("Error fetching or parsing feed:")
+ errors.append(str(p["rawdog_exception"]))
if config["showtracebacks"]:
from traceback import format_tb
- error += "\n" + "".join(format_tb(p["rawdog_traceback"]))
- elif status is None and len(p["feed"]) == 0:
- if config["ignoretimeouts"]:
- return False
+ errors.append("".join(format_tb(p["rawdog_traceback"])))
+ errors.append("")
+ fatal = True
+
+ if len(responses) != 0 and responses[0]["status"] == 301:
+ # Permanent redirect(s). Find the new location.
+ i = 0
+ while i < len(responses) and responses[i]["status"] == 301:
+ i += 1
+ location = responses[i - 1].get("location")
+
+ # According to RFC 2616, the Location header should be
+ # an absolute URI. This doesn't stop the occasional
+ # server sending something like "Location: /" or
+ # "Location: //foo/bar". It's usually a sign of
+ # brokenness, so fail rather than trying to interpret
+ # it liberally.
+ valid_uri = True
+ if location is not None:
+ parsed = urlparse.urlparse(location)
+ if parsed.scheme == "" or parsed.netloc == "":
+ valid_uri = False
+
+ if not valid_uri:
+ errors.append("New URL: " + location)
+ errors.append("The feed returned a permanent redirect, but with an invalid new location.")
+ elif location is None:
+ errors.append("The feed returned a permanent redirect, but without a new location.")
else:
- error = "Timeout while reading feed."
- elif status is None:
- # Fetched by some protocol that doesn't have status.
- pass
- elif status == 301:
- # Permanent redirect. The feed URL needs changing.
-
- error = "New URL: " + p["url"] + "\n"
- error += "The feed has moved permanently to a new URL.\n"
- if config["changeconfig"]:
- rawdog.change_feed_url(self.url, p["url"], config)
- error += "The config file has been updated automatically."
+ errors.append("New URL: " + location)
+ errors.append("The feed has moved permanently to a new URL.")
+ if config["changeconfig"]:
+ rawdog.change_feed_url(self.url, location, config)
+ errors.append("The config file has been updated automatically.")
+ else:
+ errors.append("You should update its entry in your config file.")
+ errors.append("")
+
+ bozo_exception = p.get("bozo_exception")
+ got_urlerror = isinstance(bozo_exception, urllib2.URLError)
+ got_timeout = isinstance(bozo_exception, socket.timeout)
+ if got_urlerror or got_timeout:
+ # urllib2 reported an error when fetching the feed.
+ # Check to see if it was a timeout.
+ if not (got_timeout or is_timeout_exception(bozo_exception)):
+ errors.append("Error while fetching feed:")
+ errors.append(str(bozo_exception))
+ errors.append("")
+ fatal = True
+ elif config["ignoretimeouts"]:
+ return False
else:
- error += "You should update its entry in your config file."
- non_fatal = True
- elif status in [403, 410]:
- # The feed is disallowed or gone. The feed should be unsubscribed.
- error = "The feed has gone.\n"
- error += "You should remove it from your config file."
- elif status / 100 in [4, 5]:
- # Some sort of client or server error. The feed may need unsubscribing.
- error = "The feed returned an error.\n"
- error += "If this condition persists, you should remove it from your config file."
-
- plugins.call_hook("feed_fetched", rawdog, config, self, p, error, non_fatal)
-
- if error is not None:
+ errors.append("Timeout while reading feed.")
+ errors.append("")
+ fatal = True
+ elif last_status == 304:
+ # The feed hasn't changed. Return False to indicate
+ # that we shouldn't do expiry.
+ return False
+ elif last_status in [403, 410]:
+ # The feed is disallowed or gone. The feed should be
+ # unsubscribed.
+ errors.append("The feed has gone.")
+ errors.append("You should remove it from your config file.")
+ errors.append("")
+ fatal = True
+ elif last_status / 100 != 2:
+ # Some sort of client or server error. The feed may
+ # need unsubscribing.
+ errors.append("The feed returned an error.")
+ errors.append("If this condition persists, you should remove it from your config file.")
+ errors.append("")
+ fatal = True
+ elif version == "" and len(p.get("entries", [])) == 0:
+ # feedparser couldn't detect the type of this feed or
+ # retrieve any entries from it.
+ errors.append("The data retrieved from this URL could not be understood as a feed.")
+ errors.append("You should check whether the feed has changed URLs or been removed.")
+ errors.append("")
+ fatal = True
+
+ old_error = "\n".join(errors)
+ call_hook("feed_fetched", rawdog, config, self, p, old_error, not fatal)
+
+ if len(errors) != 0:
print >>sys.stderr, "Feed: " + old_url
- if status is not None:
- print >>sys.stderr, "HTTP Status: " + str(status)
- print >>sys.stderr, error
- print >>sys.stderr
- if not non_fatal:
+ if last_status != 0:
+ print >>sys.stderr, "HTTP Status: " + str(last_status)
+ for line in errors:
+ print >>sys.stderr, line
+ if fatal:
return False
+ # From here, we can assume that we've got a complete feedparser
+ # response.
+
p = ensure_unicode(p, p.get("encoding") or "UTF-8")
- # In the event that the feed hasn't changed, then both channel
- # and feed will be empty. In this case we return 0 so that
- # we know not to expire articles that came from this feed.
+ # No entries means the feed hasn't changed, but for some reason
+ # we didn't get a 304 response. Handle it the same way.
if len(p["entries"]) == 0:
return False
@@ -467,15 +647,15 @@ class Feed:
if a.feed == feed and id is not None:
article_ids[id] = a
- seen = {}
+ seen_articles = set()
sequence = 0
for entry_info in p["entries"]:
article = Article(feed, entry_info, now, sequence)
- ignore = plugins.Box(False)
- plugins.call_hook("article_seen", rawdog, config, article, ignore)
+ ignore = Box(False)
+ call_hook("article_seen", rawdog, config, article, ignore)
if ignore.value:
continue
- seen[article.hash] = True
+ seen_articles.add(article.hash)
sequence += 1
id = entry_info.get("id")
@@ -488,14 +668,14 @@ class Feed:
if existing_article is not None:
existing_article.update_from(article, now)
- plugins.call_hook("article_updated", rawdog, config, existing_article, now)
+ call_hook("article_updated", rawdog, config, existing_article, now)
else:
articles[article.hash] = article
- plugins.call_hook("article_added", rawdog, config, article, now)
+ call_hook("article_added", rawdog, config, article, now)
if config["currentonly"]:
for (hash, a) in articles.items():
- if a.feed == feed and not seen.has_key(hash):
+ if a.feed == feed and hash not in seen_articles:
del articles[hash]
return True
@@ -526,10 +706,7 @@ class Feed:
return non_alphanumeric_re.sub('', r)
def get_keepmin(self, config):
- try:
- return int(self.args["keepmin"])
- except:
- return config["keepmin"]
+ return self.args.get("keepmin", config["keepmin"])
class Article:
"""An article retrieved from an RSS feed."""
@@ -539,14 +716,15 @@ class Article:
self.entry_info = entry_info
self.sequence = sequence
- # HOTFIX for missing date sorting.
- # TODO: Why is this needed?
- #modified = entry_info.get("modified_parsed")
- modified = entry_info.get("updated_parsed") or entry_info.get("published_parsed")
self.date = None
- if modified is not None:
+ parsed = entry_info.get("updated_parsed")
+ if parsed is None:
+ parsed = entry_info.get("published_parsed")
+ if parsed is None:
+ parsed = entry_info.get("created_parsed")
+ if parsed is not None:
try:
- self.date = calendar.timegm(modified)
+ self.date = calendar.timegm(parsed)
except OverflowError:
pass
@@ -561,21 +739,21 @@ class Article:
system (i.e. it can't just be the article ID, because that
would collide if more than one feed included the same
article)."""
- h = new_sha1()
+ h = hashlib.sha1()
def add_hash(s):
h.update(s.encode("UTF-8"))
add_hash(self.feed)
entry_info = self.entry_info
- if entry_info.has_key("title_raw"):
- add_hash(entry_info["title_raw"])
+ if entry_info.has_key("title"):
+ add_hash(entry_info["title"])
if entry_info.has_key("link"):
add_hash(entry_info["link"])
if entry_info.has_key("content"):
for content in entry_info["content"]:
- add_hash(content["value_raw"])
+ add_hash(content["value"])
if entry_info.has_key("summary_detail"):
- add_hash(entry_info["summary_detail"]["value_raw"])
+ add_hash(entry_info["summary_detail"]["value"])
return h.hexdigest()
@@ -588,7 +766,7 @@ class Article:
self.last_seen = now
def can_expire(self, now, config):
- return ((now - self.last_seen) > config["expireage"])
+ return (now - self.last_seen) > config["expireage"]
def get_sort_date(self, config):
if config["sortbyfeeddate"]:
@@ -600,7 +778,7 @@ class DayWriter:
"""Utility class for writing day sections into a series of articles."""
def __init__(self, file, config):
- self.lasttime = [-1, -1, -1, -1, -1]
+ self.lasttime = []
self.file = file
self.counter = 0
self.config = config
@@ -618,7 +796,11 @@ class DayWriter:
self.counter += 1
def time(self, s):
- tm = time.localtime(s)
+ try:
+ tm = time.localtime(s)
+ except ValueError:
+ # e.g. "timestamp out of range for platform time_t"
+ return
if tm[:3] != self.lasttime[:3] and self.config["daysections"]:
self.close(0)
self.start_day(tm)
@@ -630,17 +812,23 @@ class DayWriter:
self.start_time(tm)
self.lasttime = tm
- def close(self, n = 0):
+ def close(self, n=0):
while self.counter > n:
print >>self.file, "</div>"
self.counter -= 1
-def parse_time(value, default = "m"):
+def parse_time(value, default="m"):
"""Parse a time period with optional units (s, m, h, d, w) into a time
in seconds. If no unit is specified, use minutes by default; specify
the default argument to change this. Raises ValueError if the format
isn't recognised."""
- units = { "s" : 1, "m" : 60, "h" : 3600, "d" : 86400, "w" : 604800 }
+ units = {
+ "s": 1,
+ "m": 60,
+ "h": 3600,
+ "d": 86400,
+ "w": 604800,
+ }
for unit, size in units.items():
if value.endswith(unit):
return int(value[:-len(unit)]) * size
@@ -651,9 +839,9 @@ def parse_bool(value):
the value isn't recognised."""
value = value.strip().lower()
if value == "0" or value == "false":
- return 0
+ return False
elif value == "1" or value == "true":
- return 1
+ return True
else:
raise ValueError("Bad boolean value: " + value)
@@ -662,7 +850,8 @@ def parse_list(value):
return value.strip().split(None)
def parse_feed_args(argparams, arglines):
- """Parse a list of feed arguments. Raise ConfigError if the syntax is invalid."""
+ """Parse a list of feed arguments. Raise ConfigError if the syntax is
+ invalid, or ValueError if an argument value can't be parsed."""
args = {}
for p in argparams:
ps = p.split("=", 1)
@@ -674,23 +863,36 @@ def parse_feed_args(argparams, arglines):
if len(ps) != 2:
raise ConfigError("Bad argument line in config: " + p)
args[ps[0]] = ps[1]
- if "maxage" in args:
- args["maxage"] = parse_time(args["maxage"])
+ for name, value in args.items():
+ if name == "allowduplicates":
+ args[name] = parse_bool(value)
+ elif name == "keepmin":
+ args[name] = int(value)
+ elif name == "maxage":
+ args[name] = parse_time(value)
return args
-class ConfigError(Exception): pass
+class ConfigError(Exception):
+ pass
class Config:
"""The aggregator's configuration."""
- def __init__(self, locking):
+ def __init__(self, locking=True, logfile_name=None):
self.locking = locking
self.files_loaded = []
- if have_threading:
- self.loglock = threading.Lock()
+ self.loglock = threading.Lock()
+ self.logfile = None
+ if logfile_name:
+ self.logfile = open(logfile_name, "a")
self.reset()
def reset(self):
+ # Note that these default values are *not* the same as
+ # in the supplied config file. The idea is that someone
+ # who has an old config file shouldn't notice a difference
+ # in behaviour on upgrade -- so new options generally
+ # default to False here, and True in the sample file.
self.config = {
"feedslist" : [],
"feeddefaults" : {},
@@ -703,30 +905,38 @@ class Config:
"dayformat" : "%A, %d %B %Y",
"timeformat" : "%I:%M %p",
"datetimeformat" : None,
- "userefresh" : 0,
- "showfeeds" : 1,
+ "userefresh" : False,
+ "showfeeds" : True,
"timeout" : 30,
- "template" : "default",
+ "pagetemplate" : "default",
"itemtemplate" : "default",
- "verbose" : 0,
- "ignoretimeouts" : 0,
- "showtracebacks" : 0,
- "daysections" : 1,
- "timesections" : 1,
- "blocklevelhtml" : 1,
- "tidyhtml" : 0,
- "sortbyfeeddate" : 0,
- "currentonly" : 0,
- "hideduplicates" : "",
+ "feedlisttemplate" : "default",
+ "feeditemtemplate" : "default",
+ "verbose" : False,
+ "ignoretimeouts" : False,
+ "showtracebacks" : False,
+ "daysections" : True,
+ "timesections" : True,
+ "blocklevelhtml" : True,
+ "tidyhtml" : False,
+ "sortbyfeeddate" : False,
+ "currentonly" : False,
+ "hideduplicates" : [],
"newfeedperiod" : "3h",
- "changeconfig": 0,
- "numthreads": 0,
- "splitstate": 0,
- "useids": 0,
+ "changeconfig": False,
+ "numthreads": 1,
+ "splitstate": False,
+ "useids": False,
}
- def __getitem__(self, key): return self.config[key]
- def __setitem__(self, key, value): self.config[key] = value
+ def __getitem__(self, key):
+ return self.config[key]
+
+ def get(self, key, default=None):
+ return self.config.get(key, default)
+
+ def __setitem__(self, key, value):
+ self.config[key] = value
def reload(self):
self.log("Reloading config files")
@@ -734,7 +944,7 @@ class Config:
for filename in self.files_loaded:
self.load(filename, False)
- def load(self, filename, explicitly_loaded = True):
+ def load(self, filename, explicitly_loaded=True):
"""Load configuration from a config file."""
if explicitly_loaded:
self.files_loaded.append(filename)
@@ -743,7 +953,12 @@ class Config:
try:
f = open(filename, "r")
for line in f.xreadlines():
- stripped = line.decode(get_system_encoding()).strip()
+ try:
+ line = line.decode(get_system_encoding())
+ except UnicodeDecodeError, e:
+ raise ConfigError("Character encoding problem in config file: " + filename + ": " + str(e))
+
+ stripped = line.strip()
if stripped == "" or stripped[0] == "#":
continue
if line[0] in string.whitespace:
@@ -771,6 +986,11 @@ class Config:
elif len(l) != 2:
raise ConfigError("Bad line in config: " + line)
+ # Load template files immediately, so we produce an error now
+ # rather than later if anything goes wrong.
+ if l[0].endswith("template") and l[1] != "default":
+ load_file(l[1])
+
handled_arglines = False
if l[0] == "feed":
l = l[1].split(None)
@@ -788,7 +1008,7 @@ class Config:
self["defines"][l[0]] = l[1]
elif l[0] == "plugindirs":
for dir in parse_list(l[1]):
- plugins.load_plugins(dir, self)
+ load_plugins(dir, self)
elif l[0] == "outputfile":
self["outputfile"] = l[1]
elif l[0] == "maxarticles":
@@ -811,10 +1031,14 @@ class Config:
self["showfeeds"] = parse_bool(l[1])
elif l[0] == "timeout":
self["timeout"] = parse_time(l[1], "s")
- elif l[0] == "template":
- self["template"] = l[1]
+ elif l[0] in ("template", "pagetemplate"):
+ self["pagetemplate"] = l[1]
elif l[0] == "itemtemplate":
self["itemtemplate"] = l[1]
+ elif l[0] == "feedlisttemplate":
+ self["feedlisttemplate"] = l[1]
+ elif l[0] == "feeditemtemplate":
+ self["feeditemtemplate"] = l[1]
elif l[0] == "verbose":
self["verbose"] = parse_bool(l[1])
elif l[0] == "ignoretimeouts":
@@ -847,9 +1071,9 @@ class Config:
self["useids"] = parse_bool(l[1])
elif l[0] == "include":
self.load(l[1], False)
- elif plugins.call_hook("config_option_arglines", self, l[0], l[1], arglines):
+ elif call_hook("config_option_arglines", self, l[0], l[1], arglines):
handled_arglines = True
- elif plugins.call_hook("config_option", self, l[0], l[1]):
+ elif call_hook("config_option", self, l[0], l[1]):
pass
else:
raise ConfigError("Unknown config command: " + l[0])
@@ -858,13 +1082,16 @@ class Config:
raise ConfigError("Bad argument lines in config after: " + line)
def log(self, *args):
- """If running in verbose mode, print a status message."""
+ """Print a status message. If running in verbose mode, write
+ the message to stderr; if using a logfile, write it to the
+ logfile."""
if self["verbose"]:
- if have_threading:
- self.loglock.acquire()
- print >>sys.stderr, "".join(map(str, args))
- if have_threading:
- self.loglock.release()
+ with self.loglock:
+ print >>sys.stderr, "".join(map(str, args))
+ if self.logfile is not None:
+ with self.loglock:
+ print >>self.logfile, "".join(map(str, args))
+ self.logfile.flush()
def bug(self, *args):
"""Report detection of a bug in rawdog."""
@@ -897,20 +1124,19 @@ class AddFeedEditor:
def add_feed(filename, url, rawdog, config):
"""Try to add a feed to the config file."""
- if feedfinder is None:
- feeds = [url]
- else:
- feeds = feedfinder.feeds(url)
+ feeds = rawdoglib.feedscanner.feeds(url)
if feeds == []:
print >>sys.stderr, "Cannot find any feeds in " + url
- else:
- feed = feeds[0]
- if feed in rawdog.feeds:
- print >>sys.stderr, "Feed " + feed + " is already in the config file"
- else:
- print >>sys.stderr, "Adding feed " + feed
- feedline = "feed %s %s\n" % (config["newfeedperiod"], feed)
- edit_file(filename, AddFeedEditor(feedline).edit)
+ return
+
+ feed = feeds[0]
+ if feed in rawdog.feeds:
+ print >>sys.stderr, "Feed " + feed + " is already in the config file"
+ return
+
+ print >>sys.stderr, "Adding feed " + feed
+ feedline = "feed %s %s\n" % (config["newfeedperiod"], feed)
+ edit_file(filename, AddFeedEditor(feedline).edit)
class ChangeFeedEditor:
def __init__(self, oldurl, newurl):
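
add_feed above now delegates feed discovery to the new feedscanner module; a minimal sketch of the call it makes (the page URL is hypothetical):

    import rawdoglib.feedscanner

    # feeds() returns a list of candidate feed URLs for a page, best first;
    # an empty list means nothing could be discovered.
    candidates = rawdoglib.feedscanner.feeds("http://example.org/blog/")
    if candidates != []:
        print candidates[0]   # the URL add_feed would append to the config file
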
@@ -927,13 +1153,13 @@ class RemoveFeedEditor:
def __init__(self, url):
self.url = url
def edit(self, inputfile, outputfile):
- while 1:
+ while True:
l = inputfile.readline()
if l == "":
break
ls = l.strip().split(None)
if len(ls) > 2 and ls[0] == "feed" and ls[2] == self.url:
- while 1:
+ while True:
l = inputfile.readline()
if l == "":
break
@@ -960,63 +1186,58 @@ class FeedFetcher:
self.rawdog = rawdog
self.config = config
self.lock = threading.Lock()
- self.jobs = {}
- for feed in feedlist:
- self.jobs[feed] = 1
+ self.jobs = set(feedlist)
self.results = {}
def worker(self, num):
rawdog = self.rawdog
config = self.config
- config.log("Thread ", num, " starting")
- while 1:
- self.lock.acquire()
- if self.jobs == {}:
- job = None
- else:
- job = self.jobs.keys()[0]
- del self.jobs[job]
- self.lock.release()
- if job is None:
- break
+ while True:
+ with self.lock:
+ try:
+ job = self.jobs.pop()
+ except KeyError:
+ # No jobs left.
+ break
- config.log("Thread ", num, " fetching feed: ", job)
+ config.log("[", num, "] Fetching feed: ", job)
feed = rawdog.feeds[job]
- plugins.call_hook("pre_update_feed", rawdog, config, feed)
- self.results[job] = feed.fetch(rawdog, config)
- config.log("Thread ", num, " done")
+ call_hook("pre_update_feed", rawdog, config, feed)
+ result = feed.fetch(rawdog, config)
- def run(self, numworkers):
- self.config.log("Thread farm starting with ", len(self.jobs), " jobs")
- workers = []
- for i in range(numworkers):
- self.lock.acquire()
- isempty = (self.jobs == {})
- self.lock.release()
- if isempty:
- # No jobs left in the queue -- don't bother
- # starting any more workers.
- break
+ with self.lock:
+ self.results[job] = result
+
+ def run(self, max_workers):
+ max_workers = max(max_workers, 1)
+ num_workers = min(max_workers, len(self.jobs))
- t = threading.Thread(target = self.worker, args = (i,))
+ self.config.log("Fetching ", len(self.jobs), " feeds using ",
+ num_workers, " threads")
+ workers = []
+ for i in range(1, num_workers):
+ t = threading.Thread(target=self.worker, args=(i,))
t.start()
workers.append(t)
+ self.worker(0)
for worker in workers:
worker.join()
- self.config.log("Thread farm finished with ", len(self.results), " results")
+ self.config.log("Fetch complete")
return self.results
class FeedState(Persistable):
"""The collection of articles in a feed."""
def __init__(self):
+ Persistable.__init__(self)
self.articles = {}
class Rawdog(Persistable):
"""The aggregator itself."""
def __init__(self):
+ Persistable.__init__(self)
self.feeds = {}
self.articles = {}
self.plugin_storage = {}
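
The rewritten FeedFetcher drains a shared set of jobs under a lock and reuses the calling thread as worker 0, so "numthreads 1" still means one fetch at a time. A self-contained sketch of the same pattern, with len(job) standing in for the real feed.fetch() call:

    import threading

    class WorkerPool:
        def __init__(self, jobs):
            self.lock = threading.Lock()
            self.jobs = set(jobs)
            self.results = {}

        def worker(self, num):
            while True:
                with self.lock:
                    try:
                        job = self.jobs.pop()
                    except KeyError:
                        # No jobs left.
                        break
                result = len(job)          # stand-in for feed.fetch()
                with self.lock:
                    self.results[job] = result

        def run(self, max_workers):
            num_workers = min(max(max_workers, 1), len(self.jobs))
            workers = [threading.Thread(target=self.worker, args=(i,))
                       for i in range(1, num_workers)]
            for t in workers:
                t.start()
            self.worker(0)                 # the calling thread is worker 0
            for t in workers:
                t.join()
            return self.results

    print WorkerPool(["a", "bb", "ccc"]).run(2)
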
@@ -1053,18 +1274,20 @@ class Rawdog(Persistable):
edit_file("config", ChangeFeedEditor(oldurl, newurl).edit)
feed = self.feeds[oldurl]
+ # Changing the URL will change the state filename as well,
+ # so we need to save the old name to load from.
old_state = feed.get_state_filename()
feed.url = newurl
del self.feeds[oldurl]
self.feeds[newurl] = feed
if config["splitstate"]:
- persister, feedstate = load_persisted(feed.get_state_filename(), FeedState, config)
- for article in feedstate.articles.values():
- article.feed = newurl
- feedstate.modified()
- save_persisted(persister, config)
- os.rename(old_state, feed.get_state_filename())
+ feedstate_p = persister.get(FeedState, old_state)
+ feedstate_p.rename(feed.get_state_filename())
+ with feedstate_p as feedstate:
+ for article in feedstate.articles.values():
+ article.feed = newurl
+ feedstate.modified()
else:
for article in self.articles.values():
if article.feed == oldurl:
@@ -1086,9 +1309,18 @@ class Rawdog(Persistable):
"""Update rawdog's internal state to match the
configuration."""
+ # Make sure the splitstate directory exists.
+ if config["splitstate"]:
+ try:
+ os.mkdir("feeds")
+ except OSError:
+ # Most likely it already exists.
+ pass
+
+ # Convert to or from splitstate if necessary.
try:
u = self.using_splitstate
- except:
+ except AttributeError:
# We were last run with a version of rawdog that didn't
# have this variable -- so we must have a single state
# file.
@@ -1099,31 +1331,29 @@ class Rawdog(Persistable):
if config["splitstate"]:
config.log("Converting to split state files")
for feed_hash, feed in self.feeds.items():
- persister, feedstate = load_persisted(feed.get_state_filename(), FeedState, config)
- feedstate.articles = {}
- for article_hash, article in self.articles.items():
- if article.feed == feed_hash:
- feedstate.articles[article_hash] = article
- feedstate.modified()
- save_persisted(persister, config)
+ with persister.get(FeedState, feed.get_state_filename()) as feedstate:
+ feedstate.articles = {}
+ for article_hash, article in self.articles.items():
+ if article.feed == feed_hash:
+ feedstate.articles[article_hash] = article
+ feedstate.modified()
self.articles = {}
else:
config.log("Converting to single state file")
self.articles = {}
for feed_hash, feed in self.feeds.items():
- persister, feedstate = load_persisted(feed.get_state_filename(), FeedState, config)
- for article_hash, article in feedstate.articles.items():
- self.articles[article_hash] = article
- feedstate.articles = {}
- feedstate.modified()
- save_persisted(persister, config)
- os.unlink(feed.get_state_filename())
+ with persister.get(FeedState, feed.get_state_filename()) as feedstate:
+ for article_hash, article in feedstate.articles.items():
+ self.articles[article_hash] = article
+ feedstate.articles = {}
+ feedstate.modified()
+ persister.delete(feed.get_state_filename())
self.modified()
self.using_splitstate = config["splitstate"]
- seenfeeds = {}
+ seen_feeds = set()
for (url, period, args) in config["feedslist"]:
- seenfeeds[url] = 1
+ seen_feeds.add(url)
if not self.feeds.has_key(url):
config.log("Adding new feed: ", url)
self.feeds[url] = Feed(url)
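
The split-state conversion above relies on the new Persister interface used throughout this patch. A sketch of the two access styles it supports, assuming persister has been set up as in main() below (the filenames are hypothetical):

    # Context-manager style: the object is loaded on entry and written back
    # on exit once it has been marked modified().
    feedstate_p = persister.get(FeedState, "feeds/example.state")
    with feedstate_p as feedstate:
        feedstate.articles = {}
        feedstate.modified()

    # Explicit style, used where the object must stay open across a loop:
    feedstate = feedstate_p.open()
    try:
        feedstate.modified()
    finally:
        feedstate_p.close()

    # Renaming or removing a state file also goes through the persister:
    feedstate_p.rename("feeds/other.state")
    persister.delete("feeds/old.state")
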
@@ -1141,13 +1371,10 @@ class Rawdog(Persistable):
feed.args = newargs
self.modified()
for url in self.feeds.keys():
- if not seenfeeds.has_key(url):
+ if url not in seen_feeds:
config.log("Removing feed: ", url)
if config["splitstate"]:
- try:
- os.unlink(self.feeds[url].get_state_filename())
- except OSError:
- pass
+ persister.delete(self.feeds[url].get_state_filename())
else:
for key, article in self.articles.items():
if article.feed == url:
@@ -1155,16 +1382,13 @@ class Rawdog(Persistable):
del self.feeds[url]
self.modified()
- def update(self, config, feedurl = None):
+ def update(self, config, feedurl=None):
"""Perform the update action: check feeds for new articles, and
expire old ones."""
config.log("Starting update")
now = time.time()
- feedparser._FeedParserMixin.can_contain_relative_uris = ["url"]
- feedparser._FeedParserMixin.can_contain_dangerous_markup = []
- feedparser.BeautifulSoup = None
- set_socket_timeout(config["timeout"])
+ socket.setdefaulttimeout(config["timeout"])
if feedurl is None:
update_feeds = [url for url in self.feeds.keys()
@@ -1180,14 +1404,14 @@ class Rawdog(Persistable):
numfeeds = len(update_feeds)
config.log("Will update ", numfeeds, " feeds")
- if have_threading and config["numthreads"] > 0:
- fetcher = FeedFetcher(self, update_feeds, config)
- prefetched = fetcher.run(config["numthreads"])
- else:
- prefetched = {}
+ fetcher = FeedFetcher(self, update_feeds, config)
+ fetched = fetcher.run(config["numthreads"])
- seen_some_items = {}
+ seen_some_items = set()
def do_expiry(articles):
+ """Expire articles from a list. Return True if any
+ articles were expired."""
+
feedcounts = {}
for key, article in articles.items():
url = article.feed
@@ -1209,45 +1433,45 @@ class Rawdog(Persistable):
count += 1
del articles[key]
continue
- if (seen_some_items.has_key(url)
+ if (url in seen_some_items
and self.feeds.has_key(url)
and article.can_expire(now, config)
and feedcounts[url] > self.feeds[url].get_keepmin(config)):
- plugins.call_hook("article_expired", self, config, article, now)
+ call_hook("article_expired", self, config, article, now)
count += 1
feedcounts[url] -= 1
del articles[key]
config.log("Expired ", count, " articles, leaving ", len(articles))
+ return count > 0
+
count = 0
for url in update_feeds:
count += 1
- config.log("Updating feed ", count, " of " , numfeeds, ": ", url)
+ config.log("Updating feed ", count, " of ", numfeeds, ": ", url)
feed = self.feeds[url]
if config["splitstate"]:
- persister, feedstate = load_persisted(feed.get_state_filename(), FeedState, config)
+ feedstate_p = persister.get(FeedState, feed.get_state_filename())
+ feedstate = feedstate_p.open()
articles = feedstate.articles
else:
articles = self.articles
- if url in prefetched:
- content = prefetched[url]
- else:
- plugins.call_hook("pre_update_feed", self, config, feed)
- content = feed.fetch(self, config)
- plugins.call_hook("mid_update_feed", self, config, feed, content)
+ content = fetched[url]
+ call_hook("mid_update_feed", self, config, feed, content)
rc = feed.update(self, now, config, articles, content)
url = feed.url
- plugins.call_hook("post_update_feed", self, config, feed, rc)
+ call_hook("post_update_feed", self, config, feed, rc)
if rc:
- seen_some_items[url] = 1
+ seen_some_items.add(url)
if config["splitstate"]:
feedstate.modified()
if config["splitstate"]:
- do_expiry(articles)
- save_persisted(persister, config)
+ if do_expiry(articles):
+ feedstate.modified()
+ feedstate_p.close()
if config["splitstate"]:
self.articles = {}
@@ -1257,22 +1481,25 @@ class Rawdog(Persistable):
self.modified()
config.log("Finished update")
- def get_template(self, config):
- """Get the main template."""
- if config["template"] != "default":
- return load_file(config["template"])
+ def get_template(self, config, name="page"):
+ """Return the contents of a template."""
+
+ filename = config.get(name + "template", "default")
+ if filename != "default":
+ return load_file(filename)
- template = """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
+ if name == "page":
+ template = """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
"http://www.w3.org/TR/html4/strict.dtd">
<html lang="en">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
<meta name="robots" content="noindex,nofollow,noarchive">
"""
- if config["userefresh"]:
- template += """__refresh__
+ if config["userefresh"]:
+ template += """__refresh__
"""
- template += """ <link rel="stylesheet" href="style.css" type="text/css">
+ template += """ <link rel="stylesheet" href="style.css" type="text/css">
<title>rawdog</title>
</head>
<body id="rawdog">
@@ -1283,13 +1510,13 @@ class Rawdog(Persistable):
__items__
</div>
"""
- if config["showfeeds"]:
- template += """<h2 id="feedstatsheader">Feeds</h2>
+ if config["showfeeds"]:
+ template += """<h2 id="feedstatsheader">Feeds</h2>
<div id="feedstats">
__feeds__
</div>
"""
- template += """<div id="footer">
+ template += """<div id="footer">
<p id="aboutrawdog">Generated by
<a href="http://offog.org/code/rawdog.html">rawdog</a>
version __version__
@@ -1298,14 +1525,9 @@ by <a href="mailto:ats@offog.org">Adam Sampson</a>.</p>
</body>
</html>
"""
- return template
-
- def get_itemtemplate(self, config):
- """Get the item template."""
- if config["itemtemplate"] != "default":
- return load_file(config["itemtemplate"])
-
- template = """<div class="item feed-__feed_hash__ feed-__feed_id__" id="item-__hash__">
+ return template
+ elif name == "item":
+ return """<div class="item feed-__feed_hash__ feed-__feed_id__" id="item-__hash__">
<p class="itemheader">
<span class="itemtitle">__title__</span>
<span class="itemfrom">[__feed_title__]</span>
@@ -1316,20 +1538,36 @@ __description__
</div>
"""
- return template
-
- def show_template(self, config):
- """Show the configured main template."""
- print self.get_template(config)
+ elif name == "feedlist":
+ return """<table id="feeds">
+<tr id="feedsheader">
+<th>Feed</th><th>RSS</th><th>Last fetched</th><th>Next fetched after</th>
+</tr>
+__feeditems__
+</table>
+"""
+ elif name == "feeditem":
+ return """
+<tr class="feedsrow">
+<td>__feed_title__</td>
+<td>__feed_icon__</td>
+<td>__feed_last_update__</td>
+<td>__feed_next_update__</td>
+</tr>
+"""
+ else:
+ raise KeyError("Unknown template name: " + name)
- def show_itemtemplate(self, config):
- """Show the configured item template."""
- print self.get_itemtemplate(config)
+ def show_template(self, name, config):
+ """Show the contents of a template, as currently configured."""
+ try:
+ print self.get_template(config, name),
+ except KeyError:
+ print >>sys.stderr, "Unknown template name: " + name
def write_article(self, f, article, config):
"""Write an article to the given file."""
feed = self.feeds[article.feed]
- feed_info = feed.feed_info
entry_info = article.entry_info
link = entry_info.get("link")
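
Templates are now looked up by name through a single get_template(); a sketch of how a caller selects one, assuming config is a loaded Config and rawdog a Rawdog instance:

    # A "<name>template" config option overrides the built-in default for
    # each of the four names; any other name raises KeyError.
    page = rawdog.get_template(config, "page")
    item = rawdog.get_template(config, "item")
    feedlist = rawdog.get_template(config, "feedlist")
    rawdog.show_template("feeditem", config)   # prints the template to stdout
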
@@ -1340,7 +1578,7 @@ __description__
if guid == "":
guid = None
- itembits = {}
+ itembits = self.get_feed_bits(config, feed)
for name, value in feed.args.items():
if name.startswith("define_"):
itembits[name[7:]] = sanitise_html(value, "", True, config)
@@ -1355,7 +1593,7 @@ __description__
if key is None:
description = None
else:
- force_preformatted = feed.args.has_key("format") and (feed.args["format"] == "text")
+ force_preformatted = (feed.args.get("format", "default") == "text")
description = detail_to_html(entry_info[key], False, config, force_preformatted)
date = article.date
@@ -1379,11 +1617,6 @@ __description__
else:
itembits["title"] = '<a href="' + string_to_html(link, config) + '">' + title + '</a>'
- itembits["feed_title_no_link"] = detail_to_html(feed_info.get("title_detail"), True, config)
- itembits["feed_title"] = feed.get_html_link(config)
- itembits["feed_url"] = string_to_html(feed.url, config)
- itembits["feed_hash"] = short_hash(feed.url)
- itembits["feed_id"] = feed.get_id(config)
itembits["hash"] = short_hash(article.hash)
if description is not None:
@@ -1403,24 +1636,22 @@ __description__
else:
itembits["date"] = ""
- plugins.call_hook("output_item_bits", self, config, feed, article, itembits)
- itemtemplate = self.get_itemtemplate(config)
+ call_hook("output_item_bits", self, config, feed, article, itembits)
+ itemtemplate = self.get_template(config, "item")
f.write(fill_template(itemtemplate, itembits))
def write_remove_dups(self, articles, config, now):
"""Filter the list of articles to remove articles that are too
old or are duplicates."""
kept_articles = []
- seen_links = {}
- seen_guids = {}
+ seen_links = set()
+ seen_guids = set()
dup_count = 0
for article in articles:
feed = self.feeds[article.feed]
age = now - article.added
- maxage = config["maxage"]
- if "maxage" in feed.args:
- maxage = feed.args["maxage"]
+ maxage = feed.args.get("maxage", config["maxage"])
if maxage != 0 and age > maxage:
continue
@@ -1434,19 +1665,17 @@ __description__
if guid == "":
guid = None
- if feed.args.get("allowduplicates") != "true":
+ if not feed.args.get("allowduplicates", False):
is_dup = False
for key in config["hideduplicates"]:
if key == "id" and guid is not None:
- if seen_guids.has_key(guid):
+ if guid in seen_guids:
is_dup = True
- seen_guids[guid] = 1
- break
+ seen_guids.add(guid)
elif key == "link" and link is not None:
- if seen_links.has_key(link):
+ if link in seen_links:
is_dup = True
- seen_links[link] = 1
- break
+ seen_links.add(link)
if is_dup:
dup_count += 1
continue
@@ -1454,37 +1683,56 @@ __description__
kept_articles.append(article)
return (kept_articles, dup_count)
+ def get_feed_bits(self, config, feed):
+ """Get the bits that are used to describe a feed."""
+
+ bits = {}
+ bits["feed_id"] = feed.get_id(config)
+ bits["feed_hash"] = short_hash(feed.url)
+ bits["feed_title"] = feed.get_html_link(config)
+ bits["feed_title_no_link"] = detail_to_html(feed.feed_info.get("title_detail"), True, config)
+ bits["feed_url"] = string_to_html(feed.url, config)
+ bits["feed_icon"] = '<a class="xmlbutton" href="' + cgi.escape(feed.url) + '">XML</a>'
+ bits["feed_last_update"] = format_time(feed.last_update, config)
+ bits["feed_next_update"] = format_time(feed.last_update + feed.period, config)
+ return bits
+
+ def write_feeditem(self, f, feed, config):
+ """Write a feed list item."""
+ bits = self.get_feed_bits(config, feed)
+ f.write(fill_template(self.get_template(config, "feeditem"), bits))
+
+ def write_feedlist(self, f, config):
+ """Write the feed list."""
+ bits = {}
+
+ feeds = [(feed.get_html_name(config).lower(), feed)
+ for feed in self.feeds.values()]
+ feeds.sort()
+
+ feeditems = StringIO()
+ for key, feed in feeds:
+ self.write_feeditem(feeditems, feed, config)
+ bits["feeditems"] = feeditems.getvalue()
+ feeditems.close()
+
+ f.write(fill_template(self.get_template(config, "feedlist"), bits))
+
def get_main_template_bits(self, config):
"""Get the bits that are used in the default main template,
with the exception of items and num_items."""
- bits = { "version" : VERSION }
+ bits = {"version": VERSION}
bits.update(config["defines"])
- refresh = config["expireage"]
- for feed in self.feeds.values():
- if feed.period < refresh: refresh = feed.period
-
- bits["refresh"] = """<meta http-equiv="Refresh" """ + 'content="' + str(refresh) + '"' + """>"""
+ refresh = min([config["expireage"]]
+ + [feed.period for feed in self.feeds.values()])
+ bits["refresh"] = '<meta http-equiv="Refresh" content="' + str(refresh) + '">'
f = StringIO()
- print >>f, """<table id="feeds">
-<tr id="feedsheader">
-<th>Feed</th><th>RSS</th><th>Last fetched</th><th>Next fetched after</th>
-</tr>"""
- feeds = [(feed.get_html_name(config).lower(), feed)
- for feed in self.feeds.values()]
- feeds.sort()
- for (key, feed) in feeds:
- print >>f, '<tr class="feedsrow">'
- print >>f, '<td>' + feed.get_html_link(config) + '</td>'
- print >>f, '<td><a class="xmlbutton" href="' + cgi.escape(feed.url) + '">XML</a></td>'
- print >>f, '<td>' + format_time(feed.last_update, config) + '</td>'
- print >>f, '<td>' + format_time(feed.last_update + feed.period, config) + '</td>'
- print >>f, '</tr>'
- print >>f, """</table>"""
+ self.write_feedlist(f, config)
bits["feeds"] = f.getvalue()
f.close()
- bits["num_feeds"] = str(len(feeds))
+ bits["num_feeds"] = str(len(self.feeds))
return bits
@@ -1492,23 +1740,23 @@ __description__
"""Write a regular rawdog HTML output file."""
f = StringIO()
dw = DayWriter(f, config)
- plugins.call_hook("output_items_begin", self, config, f)
+ call_hook("output_items_begin", self, config, f)
for article in articles:
- if not plugins.call_hook("output_items_heading", self, config, f, article, article_dates[article]):
+ if not call_hook("output_items_heading", self, config, f, article, article_dates[article]):
dw.time(article_dates[article])
self.write_article(f, article, config)
dw.close()
- plugins.call_hook("output_items_end", self, config, f)
+ call_hook("output_items_end", self, config, f)
bits = self.get_main_template_bits(config)
bits["items"] = f.getvalue()
f.close()
bits["num_items"] = str(len(articles))
- plugins.call_hook("output_bits", self, config, bits)
- s = fill_template(self.get_template(config), bits)
+ call_hook("output_bits", self, config, bits)
+ s = fill_template(self.get_template(config, "page"), bits)
outputfile = config["outputfile"]
if outputfile == "-":
write_ascii(sys.stdout, s, config)
@@ -1530,14 +1778,13 @@ __description__
if config["splitstate"]:
article_list = []
for feed in self.feeds.values():
- persister, feedstate = load_persisted(feed.get_state_filename(), FeedState, config)
- article_list += list_articles(feedstate.articles)
- save_persisted(persister, config)
+ with persister.get(FeedState, feed.get_state_filename()) as feedstate:
+ article_list += list_articles(feedstate.articles)
else:
article_list = list_articles(self.articles)
numarticles = len(article_list)
- if not plugins.call_hook("output_sort_articles", self, config, article_list):
+ if not call_hook("output_sort_articles", self, config, article_list):
article_list.sort()
if config["maxarticles"] != 0:
@@ -1558,10 +1805,9 @@ __description__
found = {}
for (feed_url, article_hashes) in wanted.items():
feed = self.feeds[feed_url]
- persister, feedstate = load_persisted(feed.get_state_filename(), FeedState, config)
- for hash in article_hashes:
- found[hash] = feedstate.articles[hash]
- save_persisted(persister, config)
+ with persister.get(FeedState, feed.get_state_filename()) as feedstate:
+ for hash in article_hashes:
+ found[hash] = feedstate.articles[hash]
else:
found = self.articles
@@ -1573,16 +1819,16 @@ __description__
articles.append(a)
article_dates[a] = -date
- plugins.call_hook("output_write", self, config, articles)
+ call_hook("output_write", self, config, articles)
- if not plugins.call_hook("output_sorted_filter", self, config, articles):
+ if not call_hook("output_sorted_filter", self, config, articles):
(articles, dup_count) = self.write_remove_dups(articles, config, now)
else:
dup_count = 0
config.log("Selected ", len(articles), " of ", numarticles, " articles to write; ignored ", dup_count, " duplicates")
- if not plugins.call_hook("output_write_files", self, config, articles, article_dates):
+ if not call_hook("output_write_files", self, config, articles, article_dates):
self.write_output_file(articles, article_dates, config)
config.log("Finished write")
@@ -1594,75 +1840,66 @@ Usage: rawdog [OPTION]...
General options (use only once):
-d|--dir DIR Use DIR instead of ~/.rawdog
--v, --verbose Print more detailed status information
-N, --no-locking Do not lock the state file
+-v, --verbose Print more detailed status information
+-V|--log FILE Append detailed status information to FILE
-W, --no-lock-wait Exit silently if state file is locked
---help Display this help and exit
Actions (performed in order given):
--u, --update Fetch data from feeds and store it
--l, --list List feeds known at time of last update
--w, --write Write out HTML output
--f|--update-feed URL Force an update on the single feed URL
--c|--config FILE Read additional config file FILE
--t, --show-template Print the template currently in use
--T, --show-itemtemplate Print the item template currently in use
-a|--add URL Try to find a feed associated with URL and
add it to the config file
+-c|--config FILE Read additional config file FILE
+-f|--update-feed URL Force an update on the single feed URL
+-l, --list List feeds known at time of last update
-r|--remove URL Remove feed URL from the config file
+-s|--show TEMPLATE Show the contents of a template
+ (TEMPLATE may be: page item feedlist feeditem)
+-u, --update Fetch data from feeds and store it
+-w, --write Write out HTML output
Special actions (all other options are ignored if one of these is specified):
---upgrade OLDDIR NEWDIR Import feed state from rawdog 1.x directory
- OLDDIR into rawdog 2.x directory NEWDIR
+--dump URL Show what rawdog's parser returns for URL
+--help Display this help and exit
Report bugs to <ats@offog.org>."""
-def load_persisted(fn, klass, config, no_block = False):
- """Attempt to load a persisted object. Returns the persister and the
- object."""
- config.log("Loading state file: ", fn)
- persister = Persister(fn, klass, config.locking)
- try:
- obj = persister.load(no_block = no_block)
- except KeyboardInterrupt:
- sys.exit(1)
- except:
- print "An error occurred while reading state from " + os.getcwd() + "/" + fn + "."
- print "This usually means the file is corrupt, and removing it will fix the problem."
- sys.exit(1)
- return (persister, obj)
-
-def save_persisted(persister, config):
- if persister.object.is_modified():
- config.log("Saving state file: ", persister.filename)
- persister.save()
-
def main(argv):
"""The command-line interface to the aggregator."""
locale.setlocale(locale.LC_ALL, "")
+ # This is quite expensive and not threadsafe, so we do it on
+ # startup and cache the result.
global system_encoding
- try:
- # This doesn't exist on Python 2.2.
- # It's also quite expensive, which is why we do it on startup
- # and cache the result.
- system_encoding = locale.getpreferredencoding()
- except:
- system_encoding = "UTF-8"
+ system_encoding = locale.getpreferredencoding()
try:
- (optlist, args) = getopt.getopt(argv, "ulwf:c:tTd:va:r:NW", ["update", "list", "write", "update-feed=", "help", "config=", "show-template", "dir=", "show-itemtemplate", "verbose", "upgrade", "add=", "remove=", "no-locking", "no-lock-wait"])
+ SHORTOPTS = "a:c:d:f:lNr:s:tTuvV:wW"
+ LONGOPTS = [
+ "add=",
+ "config=",
+ "dir=",
+ "dump=",
+ "help",
+ "list",
+ "log=",
+ "no-lock-wait",
+ "no-locking",
+ "remove=",
+ "show=",
+ "show-itemtemplate",
+ "show-template",
+ "update",
+ "update-feed=",
+ "verbose",
+ "write",
+ ]
+ (optlist, args) = getopt.getopt(argv, SHORTOPTS, LONGOPTS)
except getopt.GetoptError, s:
print s
usage()
return 1
- for o, a in optlist:
- if o == "--upgrade" and len(args) == 2:
- import upgrade_1_2
- return upgrade_1_2.upgrade(args[0], args[1])
-
if len(args) != 0:
usage()
return 1
@@ -1671,21 +1908,28 @@ def main(argv):
statedir = os.environ["HOME"] + "/.rawdog"
else:
statedir = None
- verbose = 0
- locking = 1
- no_lock_wait = 0
+ verbose = False
+ logfile_name = None
+ locking = True
+ no_lock_wait = False
for o, a in optlist:
- if o == "--help":
+ if o == "--dump":
+ import pprint
+ pprint.pprint(feedparser.parse(a, agent=HTTP_AGENT))
+ return 0
+ elif o == "--help":
usage()
return 0
elif o in ("-d", "--dir"):
statedir = a
- elif o in ("-v", "--verbose"):
- verbose = 1
elif o in ("-N", "--no-locking"):
- locking = 0
+ locking = False
+ elif o in ("-v", "--verbose"):
+ verbose = True
+ elif o in ("-V", "--log"):
+ logfile_name = a
elif o in ("-W", "--no-lock-wait"):
- no_lock_wait = 1
+ no_lock_wait = True
if statedir is None:
print "$HOME not set and state dir not explicitly specified; please use -d/--dir"
return 1
@@ -1698,7 +1942,7 @@ def main(argv):
sys.path.append(".")
- config = Config(locking)
+ config = Config(locking, logfile_name)
def load_config(fn):
try:
config.load(fn)
@@ -1708,9 +1952,16 @@ def main(argv):
return 1
if verbose:
config["verbose"] = True
- load_config("config")
+ return 0
+ rc = load_config("config")
+ if rc != 0:
+ return rc
- persister, rawdog = load_persisted("state", Rawdog, config, no_lock_wait)
+ global persister
+ persister = Persister(config)
+
+ rawdog_p = persister.get(Rawdog, "state")
+ rawdog = rawdog_p.open(no_block=no_lock_wait)
if rawdog is None:
return 0
if not rawdog.check_state_version():
@@ -1721,36 +1972,39 @@ def main(argv):
rawdog.sync_from_config(config)
- plugins.call_hook("startup", rawdog, config)
+ call_hook("startup", rawdog, config)
for o, a in optlist:
- if o in ("-u", "--update"):
- rawdog.update(config)
+ if o in ("-a", "--add"):
+ add_feed("config", a, rawdog, config)
+ config.reload()
+ rawdog.sync_from_config(config)
+ elif o in ("-c", "--config"):
+ rc = load_config(a)
+ if rc != 0:
+ return rc
+ rawdog.sync_from_config(config)
elif o in ("-f", "--update-feed"):
rawdog.update(config, a)
elif o in ("-l", "--list"):
rawdog.list(config)
- elif o in ("-w", "--write"):
- rawdog.write(config)
- elif o in ("-c", "--config"):
- load_config(a)
- rawdog.sync_from_config(config)
- elif o in ("-t", "--show-template"):
- rawdog.show_template(config)
- elif o in ("-T", "--show-itemtemplate"):
- rawdog.show_itemtemplate(config)
- elif o in ("-a", "--add"):
- add_feed("config", a, rawdog, config)
- config.reload()
- rawdog.sync_from_config(config)
elif o in ("-r", "--remove"):
remove_feed("config", a, config)
config.reload()
rawdog.sync_from_config(config)
+ elif o in ("-s", "--show"):
+ rawdog.show_template(a, config)
+ elif o in ("-t", "--show-template"):
+ rawdog.show_template("page", config)
+ elif o in ("-T", "--show-itemtemplate"):
+ rawdog.show_template("item", config)
+ elif o in ("-u", "--update"):
+ rawdog.update(config)
+ elif o in ("-w", "--write"):
+ rawdog.write(config)
- plugins.call_hook("shutdown", rawdog, config)
+ call_hook("shutdown", rawdog, config)
- save_persisted(persister, config)
+ rawdog_p.close()
return 0
-
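
Putting the reworked option handling together: actions are applied in the order given on the command line, and each maps onto the handlers above. An illustrative sketch using the entry point directly (the feed URL and log filename are hypothetical):

    from rawdoglib.rawdog import main   # the main() defined above

    # Equivalent to "rawdog --update --write": fetch all feeds, then write output.
    main(["--update", "--write"])

    # Force an update of a single feed, logging detailed status to a file.
    main(["--log", "rawdog.log", "--update-feed", "http://example.org/feed.xml", "--write"])

    # Print one of the built-in templates.
    main(["--show", "feedlist"])
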