twitterのログを自動的にバックアップするGAEアプリ(cronを使ってみる)

twitterのログを長期間残しておいて、いつでもアクセス可能にしたいなーと思ったので丁度GAEがcronに対応したことだし、とアプリにしてみた。


http://rubyu-twitterlog.appspot.com/
実際に動いてるところ。


cronで10分ごとに
http://twitter.com/statuses/user_timeline.xml
から、最後に取得したIDより大きいstatusを取ってくる。
んでデータストアに入れといて、あとは適当なviewから参照するだけ。



特に難しいことはしてないが、

  • python2.6だとxmlパーサのあたりでエラーが出る
  • cronが開始されるまでラグがある

あたりでちょっと時間を食った。しかしこれでハードにtwitterを使える!



例によってソースをぺたり。
tw_archiver.py

#!-*- coding:utf-8 -*-

import wsgiref.handlers
import logging
from datetime import datetime
import os
import re

from google.appengine.ext import webapp
from google.appengine.api import urlfetch
from google.appengine.api import users
from google.appengine.ext import db
from google.appengine.ext.webapp import template

import base64
from xml.etree.ElementTree import fromstring

# Make the custom template filters (JST, autolink) available to templates.
template.register_template_library('custom_filters')

# Twitter API endpoint for the authenticated user's own timeline (XML).
url_timeline = 'http://twitter.com/statuses/user_timeline.xml'

# Path of the archive-view template, resolved relative to this file.
indexTmplPath   = os.path.join(os.path.dirname(__file__), 'index.html')

#settings
# NOTE(review): credentials are hard-coded in source; consider moving them
# out of version-controlled code.
username = 'hogehoge'
password = 'hogehoge'
limit = 100  # statuses shown per page

class Status(db.Model):
    """One archived tweet. Entities are keyed by key_name 's<id>' (see FetchHandler)."""
    index    = db.IntegerProperty(required=True)   # monotonically increasing insert counter; used for paging
    id       = db.IntegerProperty(required=True)   # Twitter status id
    source   = db.StringProperty(required=True)    # client the status was posted from
    created  = db.DateTimeProperty(required=True)  # creation time parsed by str2dt (UTC offset +0000)
    text     = db.TextProperty(required=True)      # status body

def str2dt(s):
    """Parse Twitter's created_at timestamp into a datetime.

    Expects the classic format, e.g. 'Tue Apr 07 22:52:51 +0000 2009'
    (always +0000, so the result represents UTC).

    The parameter was renamed from ``str`` to avoid shadowing the builtin.
    """
    return datetime.strptime(s, '%a %b %d %H:%M:%S +0000 %Y')

def getTimeLine(sinceid):
    """Fetch statuses newer than *sinceid* from the user timeline.

    Returns a list of dicts with keys 'id', 'text', 'source', 'created',
    ordered oldest-first.
    """
    request_url = '%s?since_id=%s' % (url_timeline, sinceid)
    # Basic auth header; encodestring appends a trailing newline, strip it.
    credentials = base64.encodestring('%s:%s' % (username, password))[:-1]
    auth_headers = {'Authorization': "Basic %s" % credentials}
    response = urlfetch.fetch(request_url, method=urlfetch.GET, headers=auth_headers)
    tree = fromstring(response.content)
    parsed = [{'id': int(node.findtext('./id')),
               'text': node.findtext('./text'),
               'source': node.findtext('./source'),
               'created': str2dt(node.findtext('./created_at'))}
              for node in tree.findall('.//status')]
    # Twitter returns newest-first; callers expect oldest-first.
    parsed.reverse()
    return parsed

def getPages(page, maxTicks=30, tickStep=1):
    """Build the pagination tick lists shown around *page*.

    Args:
        page: current 1-based page number.
        maxTicks: total tick budget; half are generated below the page and
            half above (generalized from the previous hard-coded 30).
        tickStep: distance between consecutive ticks (previously fixed at 1).

    Returns:
        dict with 'down' (pages below *page*, ascending, values < 1 omitted)
        and 'up' (pages above *page*, ascending).
    """
    downTicks = []
    upTicks = []
    # '//' keeps integer division semantics on both Python 2 and 3.
    for i in range(1, (maxTicks // 2) + 1):
        d = tickStep * i
        n = page - d
        if n > 0:
            downTicks.append(n)
        upTicks.append(page + d)
    # Ticks below the page were collected nearest-first; show them ascending.
    downTicks.reverse()
    return {'up': upTicks,
            'down': downTicks}

def getState():
    """Return (sinceid, maxindex) taken from the highest-index stored Status.

    Falls back to (1, -1) when the datastore holds no statuses yet.
    """
    sinceid = 1
    maxindex = -1
    newest = Status.gql('ORDER BY index DESC LIMIT 1')
    if newest:
        # The query is limited to one row; take it and stop.
        for entity in newest:
            sinceid = entity.id
            maxindex = entity.index
            break
    logging.info('sinceid: %s'  % sinceid)
    logging.info('maxindex: %s' % maxindex)
    return (sinceid, maxindex)

class MainHandler(webapp.RequestHandler):
    """Renders one page of the archived statuses."""

    def get(self, page):
        """Serve the archive view.

        *page* comes from the URL regex (digits or None); defaults to 1.
        An invalid page value redirects to '/'.
        """
        if page is None:
            page = 1
        page = self.v_page(page)
        if not page:
            self.redirect('/')
            return

        (sinceid, maxindex) = getState()
        # Highest index is the newest status; page 1 starts at maxindex.
        offset = maxindex - (page - 1) * limit
        statuses = Status.gql('WHERE index <= :1 ORDER BY index DESC', offset).fetch(limit)
        # Copy entities into plain dicts for the template.
        statuses = [{'id': status.id,
                     'text': status.text,
                     'source': status.source,
                     'created': status.created} for status in statuses]
        self.response.headers['Content-Type'] = 'text/html; charset=utf-8'
        self.response.out.write(template.render(indexTmplPath, {'username': username,
                                                                'statuses': statuses,
                                                                'page': page,
                                                                'pages': getPages(page),
                                                                }))

    def v_page(self, n):
        """Validate a page argument; return it as a positive int, else None.

        Narrowed the previous bare ``except:`` to the exceptions int() can
        actually raise, so unrelated errors are no longer swallowed.
        """
        try:
            n = int(n)
        except (TypeError, ValueError):
            logging.warning('v_page() fail.')
            return None
        if n > 0:
            return n
        return None


class FetchHandler(webapp.RequestHandler):
    def get(self):
        (sinceid, maxindex) = getState()
        
        statuses = getTimeLine(sinceid)
        def addStatus(key, index, id, text, source, created):
            obj = db.get( db.Key.from_path( "Status", key ) )
            if not obj:
                obj = Status(key_name=key,
                             index=index,
                             id=id,
                             text=text,
                             source=source,
                             created=created )
                obj.put()
        for status in statuses:
            try:
                maxindex += 1
                id  = status['id']
                key = 's%s' % id
                db.run_in_transaction(addStatus, key, maxindex, id, status['text'], status['source'], status['created'])
                logging.info( 'add status: %s(%s, %s)' % (status['text'], maxindex, id) )
            except Exception, e:
                logging.error( 'add fail: %s' % e )
                self.response.headers['Content-Type'] = 'text/html'
                self.response.out.write('fetch() fail.')
                return #失敗したらそこで終了 後は次にまかせる
        
        self.response.headers['Content-Type'] = 'text/html'
        self.response.out.write('fetch() OK.')

def main():
    """CGI entry point: wire URL routes to handlers and run the WSGI app."""
    routes = [
        ('^/(\d+)?', MainHandler),   # '/' or '/<page number>'
        ('^/fetch', FetchHandler),   # cron-triggered fetch
    ]
    application = webapp.WSGIApplication(routes, debug=False)
    wsgiref.handlers.CGIHandler().run(application)

if __name__ == "__main__":
    main()


index.html

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<html>
<!-- Fixed: the original opened <head> before <html>, which is invalid. -->
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>tw-archiver</title>
<style type="text/css">
/* CSS omitted */
</style>
</head>
<body>

<div id="main">
<div id="header">
archive of <a href="http://www.twitter.com/{{ username }}">http://www.twitter.com/{{ username }}</a>
</div>

<div class="pages top">
{% for p in pages.down %}
<a href="/{{ p }}">{{ p }}</a>
{% endfor %}
{{ page }}
{% for p in pages.up %}
<a href="/{{ p }}">{{ p }}</a>
{% endfor %}
</div>


{% for status in statuses %}
<div class="status">
	<div class="text">{{ status.text|autolink }}</div>
	<div class="created">{{ status.created|JST }}</div>
	<div class="id">[id:{{ status.id }}]</div>
	<div class="source">from {{ status.source }}</div>
</div>
{% endfor %}

<div class="pages bottom">
{% for p in pages.down %}
<a href="/{{ p }}">{{ p }}</a>
{% endfor %}
{{ page }}
{% for p in pages.up %}
<a href="/{{ p }}">{{ p }}</a>
{% endfor %}
</div>

</div>
</body></html>


custom_filters.py

import re
import datetime
from google.appengine.ext.webapp import template

# Registry the @register.filter decorators below attach to; picked up by
# template.register_template_library('custom_filters') in tw_archiver.py.
register = template.create_template_register()

@register.filter
def JST(time):
    """Template filter: shift a naive UTC datetime to JST (UTC+9) and format it."""
    shifted = time + datetime.timedelta(hours = 9)
    return shifted.strftime('%y/%m/%d %H:%M:%S')

# Matches a bare http:// URL up to the next whitespace.
re_autolink = re.compile( r'(http://[^\s]+)' )

@register.filter
def autolink(text):
    """Template filter: wrap bare http:// URLs in an <a href> tag.

    Fixes: the parameter was named ``str`` (shadowing the builtin), and the
    precompiled pattern was passed back through re.sub; call .sub on it
    directly instead.
    """
    return re_autolink.sub(r'<a href="\1">\1</a>', text)



cron.yaml

# App Engine cron configuration: poll Twitter every 10 minutes via /fetch.
cron:
- description: fetch twitter
  url: /fetch
  schedule: every 10 minutes



app.yaml

application: rubyu-twitterlog
version: 1
runtime: python
api_version: 1

handlers:
# Public archive view: '/' or '/<page number>'.
- url: /(\d+)?
  script: tw_archiver.py

# Fetch endpoint, restricted to admins so outsiders cannot trigger it.
# NOTE(review): cron requests are expected to satisfy the admin check — confirm
# against the App Engine cron documentation.
- url: /fetch
  script: tw_archiver.py
  login: admin