I wanted to keep a long-term log of my tweets and be able to get at them whenever I like, and since GAE had just added cron support, I went ahead and made it an app.
http://rubyu-twitterlog.appspot.com/
Here it is actually running.
Every 10 minutes, a cron job fetches from
http://twitter.com/statuses/user_timeline.xml
any statuses with an ID greater than the last one it retrieved.
Those get stored in the datastore, and then a simple view just reads them back out.
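The core of that fetch is nothing more than a Basic-auth GET with a since_id parameter. A minimal sketch of just that part (fetch_new_statuses is only an illustrative name, and the hogehoge credentials are the same placeholders as in the settings of the real source further down, where getTimeLine does this for real):

import base64
from google.appengine.api import urlfetch
from xml.etree.ElementTree import fromstring

def fetch_new_statuses(since_id, username='hogehoge', password='hogehoge'):
    # ask user_timeline.xml only for statuses newer than since_id
    url = 'http://twitter.com/statuses/user_timeline.xml?since_id=%s' % since_id
    auth = base64.encodestring('%s:%s' % (username, password))[:-1]
    result = urlfetch.fetch(url, headers={'Authorization': 'Basic %s' % auth})
    # each <status> element has id, text, source and created_at children
    return fromstring(result.content).findall('.//status')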
Nothing particularly difficult is going on, but
- with Python 2.6, an error comes up around the XML parser
- there's a lag before cron jobs actually start running
and those two points ate up a bit of my time. Still, now I can use twitter as hard as I like!
As usual, pasting the source below.
tw_archiver.py
# -*- coding: utf-8 -*-
import wsgiref.handlers
import logging
from datetime import datetime
import os
import re

from google.appengine.ext import webapp
from google.appengine.api import urlfetch
from google.appengine.api import users
from google.appengine.ext import db
from google.appengine.ext.webapp import template

import base64
from xml.etree.ElementTree import fromstring

template.register_template_library('custom_filters')

url_timeline = 'http://twitter.com/statuses/user_timeline.xml'
indexTmplPath = os.path.join(os.path.dirname(__file__), 'index.html')

#settings
username = 'hogehoge'
password = 'hogehoge'
limit = 100


class Status(db.Model):
    index = db.IntegerProperty(required=True)
    id = db.IntegerProperty(required=True)
    source = db.StringProperty(required=True)
    created = db.DateTimeProperty(required=True)
    text = db.TextProperty(required=True)


def str2dt(str):
    # Twitter's created_at format, e.g. "Mon Feb 09 12:34:56 +0000 2009"
    return datetime.strptime(str, '%a %b %d %H:%M:%S +0000 %Y')


def getTimeLine(sinceid):
    # fetch statuses newer than sinceid, with Basic auth
    url = '%s?since_id=%s' % (url_timeline, sinceid)
    base64string = base64.encodestring('%s:%s' % (username, password))[:-1]
    headers = {'Authorization': "Basic %s" % base64string}
    result = urlfetch.fetch(url, method=urlfetch.GET, headers=headers)
    elem = fromstring(result.content)
    statuses = elem.findall('.//status')
    results = []
    for status in statuses:
        id = int(status.findtext('./id'))
        text = status.findtext('./text')
        source = status.findtext('./source')
        created = str2dt(status.findtext('./created_at'))
        results.append({'id': id, 'text': text, 'source': source, 'created': created})
    results.reverse()  # oldest first, so indexes stay in chronological order
    return results


def getPages(page):
    # page numbers to show around the current page
    maxTicks = 30
    tickStep = 1
    downTicks = []
    upTicks = []
    for i in xrange(1, (maxTicks / 2) + 1):
        d = tickStep * i
        n = page - d
        if 0 < n:
            downTicks.append(n)
        upTicks.append(page + d)
    downTicks.reverse()
    return {'up': upTicks, 'down': downTicks}


def getState():
    # last stored status id and its index; defaults for an empty datastore
    sinceid = 1
    maxindex = -1
    statuses = Status.gql('ORDER BY index DESC LIMIT 1')
    if statuses:
        for status in statuses:
            sinceid = status.id
            maxindex = status.index
            break
    logging.info('sinceid: %s' % sinceid)
    logging.info('maxindex: %s' % maxindex)
    return (sinceid, maxindex)


class MainHandler(webapp.RequestHandler):
    def get(self, page):
        if None == page:
            page = 1
        page = self.v_page(page)
        if not page:
            self.redirect('/')
            return
        (sinceid, maxindex) = getState()
        offset = maxindex - (page - 1) * limit
        statuses = Status.gql('WHERE index <= :1 ORDER BY index DESC', offset).fetch(limit)
        temp = []
        for status in statuses:
            temp.append({'id': status.id, 'text': status.text, 'source': status.source, 'created': status.created})
        statuses = temp
        self.response.headers['Content-Type'] = 'text/html; charset=utf-8'
        self.response.out.write(template.render(indexTmplPath, {'username': username,
                                                                'statuses': statuses,
                                                                'page': page,
                                                                'pages': getPages(page),
                                                                }))

    def v_page(self, n):
        try:
            n = int(n)
            if 0 < n:
                return n
        except:
            logging.warning('v_page() fail.')
        return


class FetchHandler(webapp.RequestHandler):
    def get(self):
        (sinceid, maxindex) = getState()
        statuses = getTimeLine(sinceid)

        def addStatus(key, index, id, text, source, created):
            # insert only if a Status with this key_name does not exist yet
            obj = db.get(db.Key.from_path("Status", key))
            if not obj:
                obj = Status(key_name=key, index=index, id=id, text=text, source=source, created=created)
                obj.put()

        for status in statuses:
            try:
                maxindex += 1
                id = status['id']
                key = 's%s' % id
                db.run_in_transaction(addStatus, key, maxindex, id,
                                      status['text'], status['source'], status['created'])
                logging.info('add status: %s(%s, %s)' % (status['text'], maxindex, id))
            except Exception, e:
                logging.error('add fail: %s' % e)
                self.response.headers['Content-Type'] = 'text/html'
                self.response.out.write('fetch() fail.')
                return  # stop here on failure; leave the rest to the next run
        self.response.headers['Content-Type'] = 'text/html'
        self.response.out.write('fetch() OK.')


def main():
    application = webapp.WSGIApplication([('^/(\d+)?', MainHandler),
                                          ('^/fetch', FetchHandler),
                                          ],
                                         #debug=True)
                                         debug=False)
    wsgiref.handlers.CGIHandler().run(application)


if __name__ == "__main__":
    main()
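One note on the fetch side: each status is stored under key_name 's<id>' and inserted via db.run_in_transaction with a get-before-put, so refetching the same status (overlapping cron runs, a retry after an error) won't create duplicates. And because the loop bails out on the first failure, the next cron run simply resumes from the last since_id recorded in the datastore.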
index.html
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>tw-archiver</title>
<style type="text/css">
/* CSS omitted */
</style>
</head>
<body>
<div id="main">
  <div id="header">
    archive of <a href="http://www.twitter.com/{{ username }}">http://www.twitter.com/{{ username }}</a>
  </div>
  <div class="pages top">
    {% for p in pages.down %}
      <a href="/{{ p }}">{{ p }}</a>
    {% endfor %}
    {{ page }}
    {% for p in pages.up %}
      <a href="/{{ p }}">{{ p }}</a>
    {% endfor %}
  </div>
  {% for status in statuses %}
  <div class="status">
    <div class="text">{{ status.text|autolink }}</div>
    <div class="created">{{ status.created|JST }}</div>
    <div class="id">[id:{{ status.id }}]</div>
    <div class="source">from {{ status.source }}</div>
  </div>
  {% endfor %}
  <div class="pages bottom">
    {% for p in pages.down %}
      <a href="/{{ p }}">{{ p }}</a>
    {% endfor %}
    {{ page }}
    {% for p in pages.up %}
      <a href="/{{ p }}">{{ p }}</a>
    {% endfor %}
  </div>
</div>
</body>
</html>
custom_filters.py
import re
import datetime

from google.appengine.ext.webapp import template

register = template.create_template_register()


@register.filter
def JST(time):
    # shift the stored UTC time to JST (+9h) for display
    return (time + datetime.timedelta(hours=9)).strftime('%y/%m/%d %H:%M:%S')


re_autolink = re.compile(r'(http://[^\s]+)')

@register.filter
def autolink(str):
    # wrap bare http:// URLs in anchor tags
    return re.sub(re_autolink, r'<a href="\1">\1</a>', str)
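These are the |autolink and |JST filters used in index.html. Roughly, with made-up sample values:

autolink('see http://example.com')
# -> 'see <a href="http://example.com">http://example.com</a>'
JST(datetime.datetime(2009, 2, 10, 0, 0, 0))
# -> '09/02/10 09:00:00'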
cron.yaml
cron:
- description: fetch twitter
  url: /fetch
  schedule: every 10 minutes
app.yaml
application: rubyu-twitterlog
version: 1
runtime: python
api_version: 1

handlers:
- url: /(\d+)?
  script: tw_archiver.py
- url: /fetch
  script: tw_archiver.py
  login: admin
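The login: admin on /fetch keeps random visitors from triggering fetches by hand; as far as I know, App Engine's cron requests are treated as admin, so the scheduled job still gets through.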