javaに組み込んで使う場合、
http://lobobrowser.org/cobra/getting-started.jsp
BareMinimumTest.java
import javax.swing.*;
import org.lobobrowser.html.gui.*;
import org.lobobrowser.html.test.*;

/**
 * Smallest possible embedding of the HTML renderer: a Swing frame containing a
 * single {@code HtmlPanel} that is pointed at one fixed URL.
 */
public class BareMinimumTest {

    /**
     * Opens a 600x400 window with an {@code HtmlPanel} as its content and
     * navigates that panel to the start page.
     *
     * @param args ignored
     * @throws Exception whatever the renderer context throws while navigating
     */
    public static void main(String[] args) throws Exception {
        // Build the window and put the HTML panel inside it.
        JFrame frame = new JFrame();
        HtmlPanel htmlPanel = new HtmlPanel();
        frame.getContentPane().add(htmlPanel);
        frame.setSize(600, 400);
        frame.setVisible(true);

        // The renderer context drives the panel; the user agent context is the
        // stock implementation shipped with the library.
        SimpleUserAgentContext userAgent = new SimpleUserAgentContext();
        SimpleHtmlRendererContext rendererContext =
                new SimpleHtmlRendererContext(htmlPanel, userAgent);
        rendererContext.navigate("http://lobobrowser.org/browser/home.jsp");
    }
}
のようにとても簡単に使えるレンダラなのだが、文字コード判定がまったく入っていないようなので、適当に修正した。
DocumentBuilderImpl.java 内の createDocument() を次のように書き換える。
public Document createDocument(InputSource is) throws SAXException, IOException { String uri = is.getSystemId(); if(uri == null) { logger.warning("parse(): InputSource has no SystemId (URI); document item URLs will not be resolvable."); } WritableLineReader wis; Reader reader = is.getCharacterStream(); if(reader != null) { wis = new WritableLineReader(reader); } else { String charset = null; InputStream in = is.getByteStream(); if(null == in && null == uri) { throw new IllegalArgumentException("The InputSource must have either a reader, an input stream or a URI."); } if(null != uri){ java.net.URLConnection connection = new java.net.URL(uri).openConnection(); if(null == in) { in = connection.getInputStream(); } String contentType = connection.getContentType(); logger.info("contentType->" + contentType); if(null != contentType){ int pos =contentType.indexOf("charset="); if(-1 != pos){ charset = contentType.substring(pos + 8); charset = charset.replaceAll("^([\\w\\-]*)?.*", "$1"); charset = normalizer.get(charset); } } } ByteArrayOutputStream b = new ByteArrayOutputStream(); OutputStream os = new BufferedOutputStream(b); int c; try { while ((c = in.read()) != -1) { os.write(c); } } catch (IOException e) { e.printStackTrace(); } finally { if (os != null) { try { os.flush(); os.close(); } catch (IOException e) { e.printStackTrace(); } } } byte[] bytes = b.toByteArray(); if(null == charset){ String str = ""; BufferedReader bReader; bReader = new BufferedReader( new InputStreamReader( new ByteArrayInputStream(bytes))); try{ while(true){ String line = bReader.readLine(); if(null == line) break; str += line; //適当な文字数で終了する方がいい } }catch(Exception ex){ ex.printStackTrace(); }finally{ bReader.close(); } Pattern p = Pattern.compile("<head>(.*)?</head>"); Matcher m = p.matcher(str); if(m.find()){ String head = m.group(); p = Pattern.compile("<meta[^>]*charset=([\\w\\-]*)?"); m = p.matcher(head); if(m.find()){ charset = m.group(1); charset = normalizer.get(charset); } } } 
//可能ならここでUniversalDetectorなどで判定するとmore better if(null == charset){ //charset not found. use default. charset = "SJIS"; logger.info("charset->not found. use default."); }else{ logger.info("charset->" + charset); } in = new ByteArrayInputStream(bytes); wis = new WritableLineReader(new InputStreamReader(in, charset)); } HTMLDocumentImpl document = new HTMLDocumentImpl(this.bcontext, this.rcontext, wis, uri); return document; }
あと、Java での文字コード名と、HTTP レスポンスヘッダや meta の charset から取得した文字コード指定とは一致しないことが多いので、正規化する処理が必要になる。
適当なところに
private static EncodingNormalizer normalizer = new EncodingNormalizer();

/**
 * Maps the charset spellings found in HTTP headers and meta tags to encoding
 * names that the java.io classes accept.
 *
 * <p>Each table entry pairs a Java encoding name with a case-insensitive
 * regular expression for the spellings it covers; {@code -} and {@code _}
 * between name parts are optional in those expressions.
 */
public static class EncodingNormalizer {
    /** Java encoding name mapped to the regex of spellings it covers, in registration order. */
    LinkedHashMap<String,String> encs = new LinkedHashMap<String,String>();

    /** The expressions of {@link #encs}, compiled once in the constructor and delimited on both sides. */
    private final LinkedHashMap<String,Pattern> compiled = new LinkedHashMap<String,Pattern>();

    EncodingNormalizer() {
        encs.put("UTF8","utf([\\-_])?8");
        encs.put("UTF-16","utf([\\-_])?16");
        encs.put("SJIS","(s|shift)([\\-_])?jis");
        encs.put("EUC_JP","euc([\\-_])?jp");
        encs.put("ISO2022JP","iso([\\-_])?2022([\\-_])?jp");
        encs.put("ASCII","ascii");
        encs.put("Cp1252","cp([\\-_])?1252");
        encs.put("ISO8859_1","iso([\\-_])?8859([\\-_])?1");
        encs.put("UnicodeBig","unicodebig");
        encs.put("UnicodeBigUnmarked","unicodebigunmarked");
        encs.put("UnicodeLittle","unicodelittle");
        encs.put("UnicodeLittleUnmarked","unicodelittleunmarked");
        encs.put("Big5","big([\\-_])?5");
        encs.put("Cp037","cp([\\-_])?037");
        encs.put("Cp273","cp([\\-_])?273");
        encs.put("Cp277","cp([\\-_])?277");
        encs.put("Cp278","cp([\\-_])?278");
        encs.put("Cp280","cp([\\-_])?280");
        encs.put("Cp284","cp([\\-_])?284");
        encs.put("Cp285","cp([\\-_])?285");
        encs.put("Cp297","cp([\\-_])?297");
        encs.put("Cp420","cp([\\-_])?420");
        encs.put("Cp424","cp([\\-_])?424");
        encs.put("Cp437","cp([\\-_])?437");
        encs.put("Cp500","cp([\\-_])?500");
        encs.put("Cp737","cp([\\-_])?737");
        encs.put("Cp775","cp([\\-_])?775");
        encs.put("Cp838","cp([\\-_])?838");
        encs.put("Cp850","cp([\\-_])?850");
        encs.put("Cp852","cp([\\-_])?852");
        encs.put("Cp855","cp([\\-_])?855");
        encs.put("Cp856","cp([\\-_])?856");
        encs.put("Cp857","cp([\\-_])?857");
        encs.put("Cp858","cp([\\-_])?858");
        encs.put("Cp860","cp([\\-_])?860");
        encs.put("Cp861","cp([\\-_])?861");
        encs.put("Cp862","cp([\\-_])?862");
        encs.put("Cp863","cp([\\-_])?863");
        encs.put("Cp864","cp([\\-_])?864");
        encs.put("Cp865","cp([\\-_])?865");
        encs.put("Cp866","cp([\\-_])?866");
        encs.put("Cp868","cp([\\-_])?868");
        encs.put("Cp869","cp([\\-_])?869");
        encs.put("Cp870","cp([\\-_])?870");
        encs.put("Cp871","cp([\\-_])?871");
        encs.put("Cp874","cp([\\-_])?874");
        encs.put("Cp875","cp([\\-_])?875");
        encs.put("Cp918","cp([\\-_])?918");
        encs.put("Cp921","cp([\\-_])?921");
        encs.put("Cp922","cp([\\-_])?922");
        encs.put("Cp930","cp([\\-_])?930");
        encs.put("Cp933","cp([\\-_])?933");
        encs.put("Cp935","cp([\\-_])?935");
        encs.put("Cp937","cp([\\-_])?937");
        encs.put("Cp939","cp([\\-_])?939");
        encs.put("Cp942","cp([\\-_])?942");
        encs.put("Cp942C","cp([\\-_])?942([\\-_])?c");
        encs.put("Cp943","cp([\\-_])?943");
        encs.put("Cp943C","cp([\\-_])?943([\\-_])?c");
        encs.put("Cp948","cp([\\-_])?948");
        encs.put("Cp949","cp([\\-_])?949");
        encs.put("Cp949C","cp([\\-_])?949([\\-_])?c");
        encs.put("Cp950","cp([\\-_])?950");
        encs.put("Cp964","cp([\\-_])?964");
        encs.put("Cp970","cp([\\-_])?970");
        encs.put("Cp1006","cp([\\-_])?1006");
        encs.put("Cp1025","cp([\\-_])?1025");
        encs.put("Cp1026","cp([\\-_])?1026");
        encs.put("Cp1046","cp([\\-_])?1046");
        encs.put("Cp1097","cp([\\-_])?1097");
        encs.put("Cp1098","cp([\\-_])?1098");
        encs.put("Cp1112","cp([\\-_])?1112");
        encs.put("Cp1122","cp([\\-_])?1122");
        encs.put("Cp1123","cp([\\-_])?1123");
        encs.put("Cp1124","cp([\\-_])?1124");
        encs.put("Cp1140","cp([\\-_])?1140");
        encs.put("Cp1141","cp([\\-_])?1141");
        encs.put("Cp1142","cp([\\-_])?1142");
        encs.put("Cp1143","cp([\\-_])?1143");
        encs.put("Cp1144","cp([\\-_])?1144");
        encs.put("Cp1145","cp([\\-_])?1145");
        encs.put("Cp1146","cp([\\-_])?1146");
        encs.put("Cp1147","cp([\\-_])?1147");
        encs.put("Cp1148","cp([\\-_])?1148");
        encs.put("Cp1149","cp([\\-_])?1149");
        encs.put("Cp1250","cp([\\-_])?1250");
        encs.put("Cp1251","cp([\\-_])?1251");
        encs.put("Cp1253","cp([\\-_])?1253");
        encs.put("Cp1254","cp([\\-_])?1254");
        encs.put("Cp1255","cp([\\-_])?1255");
        encs.put("Cp1256","cp([\\-_])?1256");
        encs.put("Cp1257","cp([\\-_])?1257");
        encs.put("Cp1258","cp([\\-_])?1258");
        encs.put("Cp1381","cp([\\-_])?1381");
        encs.put("Cp1383","cp([\\-_])?1383");
        encs.put("Cp33722","cp([\\-_])?33722");
        encs.put("EUC_CN","euc([\\-_])?cn");
        encs.put("EUC_KR","euc([\\-_])?kr");
        encs.put("EUC_TW","euc([\\-_])?tw");
        encs.put("GBK","gbk");
        encs.put("ISO2022CN","iso([\\-_])?2022([\\-_])?cn");
        encs.put("ISO2022CN_CNS","iso([\\-_])?2022([\\-_])?cn([\\-_])?cns");
        encs.put("ISO2022CN_GB","iso([\\-_])?2022([\\-_])?cn([\\-_])?gb");
        encs.put("ISO2022KR","iso([\\-_])?2022([\\-_])?kr");
        encs.put("ISO8859_2","iso([\\-_])?8859([\\-_])?2");
        encs.put("ISO8859_3","iso([\\-_])?8859([\\-_])?3");
        encs.put("ISO8859_4","iso([\\-_])?8859([\\-_])?4");
        encs.put("ISO8859_5","iso([\\-_])?8859([\\-_])?5");
        encs.put("ISO8859_6","iso([\\-_])?8859([\\-_])?6");
        encs.put("ISO8859_7","iso([\\-_])?8859([\\-_])?7");
        encs.put("ISO8859_8","iso([\\-_])?8859([\\-_])?8");
        encs.put("ISO8859_9","iso([\\-_])?8859([\\-_])?9");
        encs.put("ISO8859_13","iso([\\-_])?8859([\\-_])?13");
        encs.put("ISO8859_15_FDIS","iso([\\-_])?8859([\\-_])?15([\\-_])?fdis");
        encs.put("JIS0201","jis([\\-_])?0201");
        encs.put("JIS0208","jis([\\-_])?0208");
        encs.put("JIS0212","jis([\\-_])?0212");
        encs.put("Johab","johab");
        encs.put("KOI8_R","koi([\\-_])?8([\\-_])?r");
        encs.put("MS874","ms([\\-_])?874");
        encs.put("MS932","ms([\\-_])?932");
        encs.put("MS936","ms([\\-_])?936");
        encs.put("MS949","ms([\\-_])?949");
        encs.put("MS950","ms([\\-_])?950");
        encs.put("MacArabic","macarabic");
        encs.put("MacCentralEurope","maccentraleurope");
        encs.put("MacCroatian","maccroatian");
        encs.put("MacCyrillic","maccyrillic");
        encs.put("MacDingbat","macdingbat");
        encs.put("MacGreek","macgreek");
        encs.put("MacHebrew","machebrew");
        encs.put("MacIceland","maciceland");
        encs.put("MacRoman","macroman");
        encs.put("MacRomania","macromania");
        encs.put("MacSymbol","macsymbol");
        encs.put("MacThai","macthai");
        encs.put("MacTurkish","macturkish");
        encs.put("MacUkraine","macukraine");
        encs.put("TIS620","tis([\\-_])?620");

        // Compile every expression once. The look-around guards stop a name
        // from matching inside a longer one: without them "iso-8859-13" is
        // taken for ISO8859_1 and "cp942c" for Cp942.
        for (java.util.Map.Entry<String,String> entry : encs.entrySet()) {
            compiled.put(entry.getKey(), Pattern.compile(
                    "(?<![a-z0-9])(?:" + entry.getValue() + ")(?![a-z0-9])",
                    Pattern.CASE_INSENSITIVE));
        }
    }

    /**
     * Resolves a charset label to an encoding name.
     *
     * <p>The label may be embedded in a longer string. Among all table entries
     * that match, the one covering the most characters wins, so
     * {@code iso-2022-cn-gb} resolves to {@code ISO2022CN_GB} rather than
     * {@code ISO2022CN}; ties go to the entry registered first. A label the
     * table does not cover is returned under its canonical name if the
     * running JVM supports it.
     *
     * @param str charset label, for example {@code Shift_JIS}; may be null
     * @return the encoding name, or null if the label is null or not recognized
     */
    public String get(String str) {
        if (str == null) {
            return null;
        }
        String best = null;
        int bestLength = 0;
        for (java.util.Map.Entry<String,Pattern> entry : compiled.entrySet()) {
            Matcher m = entry.getValue().matcher(str);
            if (m.find()) {
                int length = m.end() - m.start();
                if (length > bestLength) {
                    best = entry.getKey();
                    bestLength = length;
                }
            }
        }
        if (best != null) {
            return best;
        }
        // Not in the table (e.g. windows-1252): accept whatever the JVM knows.
        String name = str.trim();
        try {
            if (name.length() > 0 && java.nio.charset.Charset.isSupported(name)) {
                return java.nio.charset.Charset.forName(name).name();
            }
        } catch (IllegalArgumentException ignored) {
            // Syntactically illegal charset name: treated as unrecognized.
        }
        return null;
    }
}
を追加