scrapbookのデータディレクトリのインデックスを作る

scrapIndexerが吐いたhtmlがいまいち使いづらかったのでpythonで適当に書いた。
14MBのhtmlとか出力してくれてうわーってなった＞scrapIndexer

ディレクトリ（フォルダ）構造は無視する
scrapbook.rdfは読まない
そのため高速
指定した最大アイテム数ごとにページを分割して出力する
ソースURLへのリンクも表示する

scrapIndexerとの違いはこのくらいだと思う。

タイトルやソースURLは、各アイテムのディレクトリにあるindex.dat（〜\data\〜\index.dat）に記録されているので、そこから読み取っている。
データが破損している場合や、保存されているデータの一部だけをコピーした場合を想定すると、scrapbook.rdfには頼れないためである。

履歴

リリース
faviconがあれば表示するように

#!-*- coding:utf-8 -*-

"""
scrap_index_gen.py
    Generate an index of a ScrapBook data directory.

Arguments
    1. path
        Path of the ScrapBook directory, i.e. the directory that contains
        scrapbook.rdf (for example ...\\scrapbook).
        Do not pass the ...\\scrapbook\\data directory itself.
"""

import sys
import os
import codecs
import re
import datetime
import math


# Settings
itemPP = 400                # maximum number of items listed on one page
putname = 'scrap-i_%s.html' # output file name pattern; %s receives the page number


# Timestamp computed once at start-up; shown in every page's title and header.
now = datetime.datetime.now().strftime('%y/%m/%d %H:%M:%S')
# Placeholder shown when an item's index.dat yields no title or id.
errorStr_read_index_dat = 'readerror(index.dat)'

def toJavaPath(path):
    """Convert a local filesystem path into a ``file:///`` URL string.

    The first character of *path* is treated as a drive letter and
    upper-cased, and every backslash is converted to a forward slash.

    Args:
        path: Filesystem path, e.g. ``c:\\scrapbook\\data\\x\\index.html``.

    Returns:
        The path prefixed with ``file:///``, using forward slashes.
    """
    # Slice instead of indexing so an empty path does not raise IndexError.
    javapath = path[:1].upper() + path[1:]
    # str.replace returns a new string; the result has to be kept, otherwise
    # the backslashes are left untouched.
    javapath = javapath.replace('\\', '/')
    return 'file:///' + javapath

# One index.dat record: a key, a tab, then the value (which may contain tabs).
re_dat = re.compile(r'^([^\t]+)\t(.+)$')


def read_index_dat(path):
    """Parse an ``index.dat`` file into a dict of key -> value.

    The file is read as UTF-8. Each line of the form ``key<TAB>value`` adds
    one entry; lines that do not match are skipped. If a key appears more
    than once, the last value wins.

    Args:
        path: Path of the index.dat file to read.

    Returns:
        A dict mapping keys to values. An empty dict is returned when the
        file cannot be opened or read, or is not valid UTF-8 (best effort:
        the caller substitutes placeholder text for missing keys).
    """
    dat = {}
    try:
        # The context manager closes the file even when decoding fails midway.
        with codecs.open(path, 'r', 'utf-8') as lines:
            for line in lines:
                # Drop the line terminator first so a CRLF file does not
                # leave a trailing '\r' inside the captured value.
                m = re_dat.match(line.rstrip('\r\n'))
                if m:
                    dat[m.group(1)] = m.group(2)
    except (IOError, OSError, UnicodeError):
        # Unreadable or undecodable file: discard any partial result.
        return {}
    return dat

def put(maxPage, nowPage, arr, putdir):
    """Write one page of the index as an HTML file inside *putdir*.

    The page consists of a header (generation time and the first/last
    directory names on this page), a pager linking to every page, one entry
    per item directory, and the pager again. The file name is
    ``putname % nowPage`` and the content is written as UTF-8.

    Args:
        maxPage: Total number of pages; the pager lists 1..maxPage.
        nowPage: Number of the page being written (shown un-linked in the
            pager).
        arr: Item directory paths to list on this page. Each is expected to
            hold ``index.dat`` and ``index.html``, and optionally
            ``favicon.ico``.
        putdir: Directory the HTML file is written to.
    """
    # Imported locally so this function does not depend on a file-level
    # import; xml.sax.saxutils exists on both Python 2 and Python 3.
    from xml.sax.saxutils import escape

    def esc(value):
        # Escape &, <, > and double quotes so the value is safe both as
        # element text and inside a double-quoted attribute.
        return escape(value, {'"': '&quot;'})

    # Guard against an empty page so arr[0] cannot raise IndexError.
    start = os.path.basename(arr[0]) if arr else ''
    end = os.path.basename(arr[-1]) if arr else ''

    html = [
        '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">\n',
        '<html>\n',
        '<head>\n',
        '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n',
        '<title>scrap-i [%s/%s] %s</title>\n' % (nowPage, maxPage, now),
        "<style type='text/css'>\n",
        'div.gentime{ text-align:right; }\n',
        'div.start2end{ text-align:right; font-size:x-large; margin:10px;}\n',
        'div.item{margin:5px; font-size:small;}\n',
        'div.item a{margin-left:10px;}\n',
        'div.item img{height:16px; margin-left:10px;}\n',
        'div.pbar{margin-bottom:10px; border-bottom:1px solid silver;}\n',
        'div.pbar span, div.pbar a{margin:5px; font-size:small;}\n',
        'div.pbar span{font-size:x-large;}\n',
        '</style>\n',
        '</head>\n',
        '<body>\n',
        '<div class="gentime">generated on %s</div>\n' % now,
        '<div class="start2end">%s - %s</div>\n' % (esc(start), esc(end)),
    ]

    # Pager: the current page is a plain <span>, every other page is a link.
    pbar = ['<div class="pbar">']
    for i in range(1, maxPage + 1):
        if i == nowPage:
            pbar.append('<span class="now">%s</span>' % i)
        else:
            pbar.append('<a href="%s">%s</a>' % (putname % i, i))
    pbar.append('</div>\n')
    pbar = ''.join(pbar)

    html.append(pbar)
    for d in arr:
        datPath = os.path.join(d, 'index.dat')
        linkPath = os.path.join(d, 'index.html')
        faviconPath = os.path.join(d, 'favicon.ico')

        if os.path.exists(faviconPath):
            favicon = '<img src="%s" />' % esc(toJavaPath(faviconPath))
        else:
            favicon = ''

        # Missing keys (including an unreadable index.dat, which yields an
        # empty dict) fall back to the read-error placeholder.
        dat = read_index_dat(datPath)
        title = '<a href="%s">%s</a>' % (
            esc(toJavaPath(linkPath)),
            esc(dat.get('title', errorStr_read_index_dat)),
        )
        if 'source' in dat:
            source = u'<a href="%s">ソースURLを開く</a>' % esc(dat['source'])
        else:
            source = ''
        item_id = '<span class="id">[%s]</span>' % esc(
            dat.get('id', errorStr_read_index_dat))

        html.append('<div class="item">')
        html.append(item_id)
        html.append(favicon)
        html.append(title)
        html.append(source)
        html.append('</div>\n')

    html.append(pbar)
    html.append('</body>\n</html>\n')
    html = ''.join(html)

    # Close the output file deterministically instead of relying on garbage
    # collection to flush it.
    with codecs.open(os.path.join(putdir, putname % nowPage), 'w', 'utf-8') as out:
        out.write(html)
        
def gen(itemPP, path):
    """Generate every index page for the ScrapBook directory *path*.

    The sub-directories of ``<path>/data`` are collected (plain files are
    ignored), sorted by path, split into chunks of *itemPP* entries, and
    each chunk is handed to ``put`` with *path* as the output directory.
    Nothing is written when there are no item directories.

    Args:
        itemPP: Maximum number of items per page.
        path: Directory that contains the ``data`` directory.
    """
    datadir = os.path.join(path, 'data')
    candidates = [os.path.join(datadir, name) for name in os.listdir(datadir)]
    item_dirs = sorted(p for p in candidates if os.path.isdir(p))

    maxPage = int(math.ceil(float(len(item_dirs)) / itemPP))
    for page in range(maxPage):
        first = page * itemPP
        # Pages are numbered from 1 in the generated files.
        put(maxPage, page + 1, item_dirs[first:first + itemPP], path)

if __name__ == '__main__':
    # Run only when exactly one argument (the ScrapBook directory) is given;
    # any other argument count does nothing.
    if len(sys.argv) == 2:
        gen(itemPP, sys.argv[1])