Вот, насобирал. Требуются html2text.py и feedparser.py.
# -*- coding: utf-8 -*-
import urllib, os, base64, hashlib, time, html2text, feedparser as feed
# Feed URL passed to parse_news_msgs() (and from there to feed.parse()) by the
# call at the end of this script.
RSSURL='http://www.linux.org.ru/news/linux-general/10737394?output=rss'
# Echo area name: written into each message header by newmsg() and used as the
# file name under echo/ and rssdb/.
ECHO='re.14'
def hsh(s):
    """Return a 20-character identifier derived from the SHA-256 of *s*.

    The digest is URL-safe base64 encoded, every '-' is replaced by 'A' and
    every '_' by 'z', and the result is cut to its first 20 characters, so
    the identifier contains only ASCII letters and digits.

    s -- byte string to hash.
    Returns a byte string (a plain ``str`` on Python 2).
    """
    digest = hashlib.sha256(s).digest()
    encoded = base64.urlsafe_b64encode(digest)
    # Bytes literals are the same objects as the original str literals on
    # Python 2, and keep replace() valid where b64encode returns bytes.
    return encoded.replace(b'-', b'A').replace(b'_', b'z')[:20]
def ru(fn):
    """Read file *fn* and return its contents decoded from UTF-8.

    fn -- path of the file to read.
    Returns the decoded text, or '' when the file cannot be opened or read,
    or when its contents are not valid UTF-8.
    """
    try:
        # 'with' closes the handle deterministically instead of leaving it
        # to the garbage collector.
        with open(fn, 'rb') as src:
            return src.read().decode('utf-8')
    except (IOError, OSError, UnicodeError):
        # Best-effort read: a missing or undecodable file counts as empty.
        # Unlike a bare except, this lets KeyboardInterrupt/SystemExit through.
        return ''
def gts():
    """Return the current time.time() value truncated to an integer."""
    now = time.time()
    return int(now)
def newmsg(ea,msgfrom,addr,msgto,subj,txt):
    """Build a message, store it under msg/ and index it under echo/.

    The message text is 'ii/ok' followed by the echo area, the current
    timestamp from gts(), sender, address, recipient and subject, one per
    line, then a blank line and the body.

    ea      -- echo area name; also the index file name under echo/.
    msgfrom -- sender name.
    addr    -- sender address string.
    msgto   -- recipient name.
    subj    -- subject line.
    txt     -- message body.

    Returns the message id from hsh() when the message was stored, or None
    when the assembled message is 65536 characters or longer (nothing is
    written in that case).
    """
    s = 'ii/ok\n%s\n%s\n%s\n%s\n%s\n%s\n\n%s' % (ea,gts(),msgfrom,addr,msgto,subj,txt)
    h = hsh(s)
    # NOTE(review): the source lost its indentation; the return is assumed
    # to belong to the size check together with both writes -- confirm.
    if len(s) >= 65536:
        return None
    # Context managers flush and close each file before the next step,
    # instead of relying on garbage collection of the anonymous handles.
    with open('msg/%s' % h, 'wb') as msg_file:
        msg_file.write(s)
    with open('echo/%s' % ea, 'ab') as echo_file:
        echo_file.write(h + '\n')
    return h
def getf(l):
    """Download URL *l* and return the response body, gunzipped if needed.

    The request advertises gzip support; when the response carries
    'Content-Encoding: gzip' the body is decompressed before it is returned.

    l -- URL to fetch.
    Returns the raw (undecoded) response body.
    Raises whatever urlopen() raises on network or HTTP errors.
    """
    import gzip
    from io import BytesIO
    # urllib2 was used here without ever being imported (the file imports
    # only urllib), so the first call raised NameError.
    try:
        import urllib2
    except ImportError:
        # Python 3 moved Request/urlopen into urllib.request.
        import urllib.request as urllib2
    print('fetch %s' % l)
    request = urllib2.Request(l)
    request.add_header('Accept-encoding', 'gzip')
    response = urllib2.urlopen(request)
    try:
        body = response.read()
        if response.info().get('Content-Encoding') == 'gzip':
            body = gzip.GzipFile(fileobj=BytesIO(body)).read()
    finally:
        # Release the connection even if reading or decompressing fails.
        response.close()
    return body
def parse_news_msgs(rurl,ea):
    """Post every not-yet-seen entry of an RSS feed as a message in *ea*.

    Guids of entries already posted are kept one per line in rssdb/<ea>.
    Each new entry's description is converted from HTML to text, posted
    with newmsg(), and its guid is appended to that file.

    rurl -- feed URL (or anything else feed.parse() accepts).
    ea   -- echo area name; also the file name under rssdb/.
    """
    o = feed.parse(rurl)
    db_path = 'rssdb/%s' % ea
    # A set gives constant-time lookups, and it is updated below so that a
    # guid occurring twice in the same feed is posted only once.
    seen = set(ru(db_path).splitlines())
    # Presumably the feed lists newest entries first, so reversing posts
    # them oldest-first -- verify against the feed.
    for q in reversed(o.entries):
        if q.guid in seen:
            continue
        # Fresh converter per entry, as in the original; whether an
        # HTML2Text instance can be safely reused is not checked here.
        h = html2text.HTML2Text()
        h.body_width = 0
        h.ignore_links = False
        h.ignore_images = True
        txt = h.handle(q.description).encode('utf-8')
        # Entries without an author element fall back to 'rss' instead of
        # raising AttributeError; empty authors fall back as before.
        author = (getattr(q, 'author', None) or u'').encode('utf-8') or 'rss'
        newmsg(ea, author, 'rssarea,1', 'All', q.title.encode('utf-8'), txt)
        # Record the guid right after posting so an interrupted run does
        # not repost it; 'with' closes the file on every iteration.
        with open(db_path, 'a') as db:
            db.write('%s\n' % q.guid)
        seen.add(q.guid)
# Script entry point: executed whenever this file is run or imported; posts
# the new entries of RSSURL into the ECHO area.
parse_news_msgs(RSSURL,ECHO)
Ещё для работы требуются пустые каталоги rssdb, msg и echo.