数据抓取 HTMLUNIT |
|
|
最近在用Jsoup抓取某网站数据,可有些页面是ajax请求动态生成的,去群里问了一下,大神说模拟ajax请求即可。去网上搜索了一下,发现了这篇文章,拿过来先用着试试。转帖如下:
网上关于网络爬虫实现方式有很多种,但是很多都不支持Ajax,李兄说:模拟才是王道。确实,如果能够模拟一个没有界面的浏览器,还有什么不能做到的呢? 关于解析Ajax网站的框架也有不少,我选择了HtmlUnit,官方网站:http://htmlunit.sourceforge.net/ ,htmlunit可以说是一个Java版本的无界面浏览器,几乎无所不能,而且很多东西都封装得特别完美。这是这几天来积累下来的心血,记录一下。
package com.lanyotech.www.wordbank;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.util.List;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.ScriptResult;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlOption;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.html.HtmlSelect;
public class WorldBankCrawl {
private static String TARGET_URL = "http://databank.worldbank.org/ddp/home.do";
public static void main(String[] args) throws FailingHttpStatusCodeException, MalformedURLException, IOException {
//模拟一个浏览器
WebClient webClient = new WebClient();
//设置webClient的相关参数
webClient.setJavaScriptEnabled(true);
webClient.setCssEnabled(false);
webClient.setAjaxController(new NicelyResynchronizingAjaxController());
webClient.setTimeout(35000);
webClient.setThrowExceptionOnScriptError(false);
//模拟浏览器打开一个目标网址
HtmlPage rootPage= webClient.getPage(TARGET_URL);
//获取第一个数据库
HtmlSelect hs = (HtmlSelect) rootPage.getElementById("lstCubes");
//按要求选择第一个数据库
hs.getOption(0).setSelected(true);
//模拟点击Next按钮,跳转到第二个页面
System.out.println("正在跳转…");
//执行按钮出发的js事件
ScriptResult sr = rootPage.executeJavaScript("javascript:setCubeData(2,-1,4,'/ddp');");
//跳转到第二个页面,选择国家
HtmlPage countrySelect = (HtmlPage) sr.getNewPage();
//获得包含全部国家信息的选择框页面
HtmlPage framePage=(HtmlPage)countrySelect.getFrameByName("frmTree1″).getEnclosedPage();
//获得selectAll按钮,触发js事件
framePage.executeJavaScript("javascript:TransferListAll(‘countrylst','countrylstselected','no');SetSelectedCount(‘countrylstselected','tdcount');");
//获取Next按钮,触发js事件
ScriptResult electricityScriptResult = framePage.executeJavaScript("javascript:wrapperSetCube('/ddp')");
System.out.println("正在跳转…");
//跳转到下一个页面electricitySelect
HtmlPage electricitySelect = (HtmlPage) electricityScriptResult.getNewPage();
//获得electricity选择的iframe
HtmlPage electricityFrame = (HtmlPage) electricitySelect.getFrameByName("frmTree1″).getEnclosedPage();
//获得选择框
HtmlSelect seriesSelect = (HtmlSelect) electricityFrame.getElementById("countrylst");
//获得所有的选择框内容
List optionList = seriesSelect.getOptions();
//将指定的选项选中
optionList.get(1).setSelected(true);
//模拟点击select按钮 electricityFrame.executeJavaScript("javascript:TransferList('countrylst','countrylstselected','no');SetSelectedCount('countrylstselected','tdcount');");
//获取选中后,下面的选择框
HtmlSelect electricitySelected = (HtmlSelect) electricityFrame.getElementById("countrylstselected");
List list = electricitySelected.getOptions();
//模拟点击Next按钮,跳转到选择时间的页面
ScriptResult timeScriptResult = electricityFrame.executeJavaScript("javascript:wrapperSetCube('/ddp')");
System.out.println("正在跳转…");
HtmlPage timeSelectPage = (HtmlPage) timeScriptResult.getNewPage();
//获取选中时间的选择框
timeSelectPage = (HtmlPage) timeSelectPage.getFrameByName("frmTree1″).getEnclosedPage();
//选中所有的时间 timeSelectPage.executeJavaScript("javascript:TransferListAll('countrylst','countrylstselected','no');SetSelectedCount('countrylstselected','tdcount');");
//点击Next按钮
ScriptResult exportResult = timeSelectPage.executeJavaScript("javascript:wrapperSetCube('/ddp')");
System.out.println("正在跳转…");
//转到export页面
HtmlPage exportPage = (HtmlPage) exportResult.getNewPage();
//点击页面上的Export按钮,进入下载页面
ScriptResult downResult = exportPage.executeJavaScript("javascript:exportData('/ddp' ,'EXT_BULK' ,'WDI_Time=51||WDI_Series=1||WDI_Ctry=244||' );");
System.out.println("正在跳转…");
HtmlPage downLoadPage = (HtmlPage) downResult.getNewPage();
//点击Excel图标,开始下载
ScriptResult downLoadResult = downLoadPage.executeJavaScript("javascript:exportData('/ddp','BULKEXCEL');");
//下载Excel文件
InputStream is = downLoadResult.getNewPage().getWebResponse().getContentAsStream();
OutputStream fos = new FileOutputStream("d://test.xls");
byte[] buffer=new byte[1024*30];
int len=-1;
while((len=is.read(buffer))>0){
fos.write(buffer, 0, len);
}
fos.close();
fos.close();
System.out.println("Success!");
}
}
|
js mobile |
|
|
sencha touch >kendo ui >jquery mobile
|
wechat_root |
|
|
<%@ page language="java" import="java.util.*" pageEncoding="UTF-8"%>
<%@include file="common/common.jsp" %>
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<meta name="viewport"
content="width=device-width, minimum-scale=1, maximum-scale=1">
<title>WeChat Root 20131030</title>
<script type="text/javascript">
// Page-level state shared by the functions below.
// Base URL that resolveXML() replaces with DOMAIN in the links it renders.
NIUCHEURL = "http://1.weixinniuche1.duapp.com";
var HOST = window.location.host;
//console.log(window.location.host);
//console.log(window.location.pathname);
//console.log(window.location.port);
// Base URL of this web application.
// NOTE(review): WEB_CONTENT is not defined in this file — presumably supplied by common/common.jsp; confirm.
DOMAIN = "http://" + HOST + WEB_CONTENT;//url_temp.substring(0, url_temp.indexOf("/root.jsp"));
// Identifiers sent with every simulated menu click; filled in when the home page is shown.
var openId;
var dealerId;
var orderId;
// Currently uses fixed test values; the commented-out expressions read them from the (disabled) form inputs.
$(document).on("pageshow", "#homePage", function(event, data) {
openId = "openIdTxt";//$("#openIdTxt").val();
dealerId = "";//$("#dealerIdTxt").val();
orderId = "";//$("#orderIdTxt").val();
});
/**
 * Simulates a menu CLICK event for the given key by posting it to the
 * application's /Menu endpoint.
 *
 * @param {string} key Event key of the menu entry (e.g. "my_car_01").
 */
function linkToNiuChe(key) {
    // Encode every value so characters such as '&', '=' or '#' cannot
    // corrupt the query string.
    var toLink = DOMAIN
        + "/Menu?MsgType=event&Event=CLICK&EventKey=" + encodeURIComponent(key)
        + "&FromUserName=" + encodeURIComponent(openId)
        + "&dealerId=" + encodeURIComponent(dealerId);
    //ajaxCallCore(toLink);
    callByXMLHttpRequest(toLink, key);
    /*
    Disabled legacy routing: navigated straight to a per-key page instead of
    posting a click event. Kept for reference.
    if ("my_car_01" == key) {// driving info
        toLink += "/coreServlet?MsgType=event&Event=CLICK&EventKey=" + key
        + "&FromUserName=" + openId + "&dealerId=" + dealerId;
        ajaxCallCore(toLink);
        return;
        //toLink += "/travelInfo.do?flag=travelInfo&openID="+openId+"&dealerId="+dealerId;
    } else if ("my_car_02" == key) {// driving behaviour
    } else if ("my_car_03" == key) {// vehicle health check
        toLink += "/discoverHealth.do?flag=cardiagnosis&openID=" + openId
        + "&dealerId=" + dealerId;
    } else if ("my_car_04" == key) {// vehicle location
        toLink += "/carmessage/carMap.jsp";//?lng="+map.get("lNG").toString()+"&lat="+map.get("lAT").toString();
    } else if ("my_car_05" == key) {// repair history
        toLink += "/repairRe.do?flag=repairrecord&openID=" + openId
        + "&dealerId=" + dealerId;
    } else if ("more_01" == key) {// traffic violations
        toLink += "/getBreakRegulRecMm.do?flag=breakregule&openID="
        + openId + "&dealerId=" + dealerId;
    } else if ("more_02" == key) {// used-car info
        //toLink += "/usedCarListMm.do?flag=usedCar&openID="+openId+"&modelId="+mt.get("modelId")+"&modelName="+modelName+"&dealerId="+dealerId;
    } else if ("more_03" == key) {// remote ignition
    } else if ("more_04" == key) {// switch vehicle
        toLink += "/carBindDrmNum.do?flag=carbinddrm&openID=" + openId
        + "&dealerId=" + dealerId;
    } else if ("main_introduce" == key) {// introduction
        //toLink +=
    } else if ("main_appointment" == key) {// appointment / lookup
        //"/dealersInfo.do?flag=getDealersInfo&openID="+openId+"&dealerId="+dealerId;
        toLink += "/activeService.do?flag=activeService&openID=" + openId
        + "&orderID=" + orderId + "&dealerId=" + dealerId;
    }
    $.mobile.changePage(toLink);
    */
}
/**
 * POSTs a CLICK event message (XML body) to the given URL and hands the
 * response to callback().
 *
 * @param {string} url Target URL.
 * @param {string} eventKey Event key placed in the <EventKey> element.
 * @throws {Error} If the browser offers no XMLHttpRequest implementation.
 */
function callByXMLHttpRequest(url, eventKey) {
    // Keep the request object local so overlapping calls do not share
    // (and overwrite) one global instance.
    var req;
    if (window.XMLHttpRequest) {
        req = new XMLHttpRequest();
    } else if (window.ActiveXObject) {
        req = new ActiveXObject("Microsoft.XMLHTTP");
    } else {
        throw new Error("XMLHttpRequest is not supported by this browser");
    }
    req.open("POST", url, true);
    req.onreadystatechange = callback;
    var xmlStr =
        "<xml>" +
        "<FromUserName><![CDATA[" + openId + "]]></FromUserName>" +
        "<MsgType><![CDATA[event]]></MsgType>" +
        "<Event><![CDATA[CLICK]]></Event>" +
        "<EventKey><![CDATA[" + eventKey + "]]></EventKey>" +
        "</xml>";
    req.send(xmlStr);
}
/**
 * readystatechange handler: once the request has completed, parses the XML
 * response and renders it; shows an alert when the request or parsing fails.
 *
 * @param {Event} [data] Event whose currentTarget is the request object.
 */
function callback(data) {
    // Fall back to `this` when the handler is invoked without an event argument.
    var xhr = (data && data.currentTarget) ? data.currentTarget : this;
    if (!xhr || xhr.readyState != 4) {
        return; // request still in progress
    }
    if (xhr.status != 200) {
        alert("网络不稳定,请稍后再试!");
        return;
    }
    var xmlDoc;
    try {
        // $.parseXML throws on malformed XML.
        xmlDoc = $.parseXML(xhr.responseText);
    } catch (e) {
        xmlDoc = null;
    }
    if (!xmlDoc) {
        alert("网络不稳定,请稍后再试!");
        return;
    }
    resolveXML(xmlDoc);
}
/**
 * Renders a parsed response message into the #msglist list and scrolls the
 * message pane to the bottom.
 *
 * A message whose MsgType is "text" becomes one plain entry built from
 * <Content>; any other message gets one linked entry per <item>
 * (Title / Description / Url).
 *
 * @param {Document} xmlDoc Parsed XML response.
 */
function resolveXML(xmlDoc) {
    var doc = $(xmlDoc);
    var msgType = doc.find("MsgType").text();
    var li_str = "";
    // NOTE(review): response text is inserted as HTML without escaping;
    // confirm the backend only returns trusted markup.
    if (msgType == "text") {
        var content = doc.find("Content").text();
        li_str = '<li data-icon="false"><div data-role="button" style="cursor:default">';
        li_str += '<pre style="text-align:left;word-wrap:break-word;">' + content + '</pre>';
        li_str += '</div></li>';
    } else {
        // Build the entry inside the loop so every item is rendered,
        // not only the last one.
        doc.find("item").each(function() {
            var element = $(this);
            var title = element.find("Title").text();
            var description = element.find("Description").text();
            // Point backend links at this deployment.
            var url = element.children("Url").text().replace(NIUCHEURL, DOMAIN);
            li_str += '<li data-icon="false"><a href="' + url + '" target="" data-role="button" >';
            li_str += '<h3 style="text-align:left;margin:0px">' + title + '</h3><br>';
            li_str += '<pre style="text-align:left;word-wrap:break-word;">' + description + '</pre><br>';
            li_str += '</a></li>';
        });
    }
    if (li_str) {
        $("#msglist").append(li_str).trigger('create');
    }
    $("#msgContent").scrollTop(
        $("#msgContent")[0].scrollHeight);
}
</script>
</head>
<body>
<!-- /page -->
<div data-role="page" id="homePage">
<!-- /header -->
<!--
<div data-role="header" data-theme="b">
<h1>WeChartRoot 20131030</h1>
<a href="./WeChartRoot20131030.html" data-icon="home"
data-iconpos="notext" data-direction="reverse"
class="ui-btn-right jqm-home">Home</a>
</div>
-->
<!-- /content -->
<div data-role="content">
<!--
<div style="width: 100%">
<form>
<label>OpenId</label>
<input type="text" data-clear-btn="true" id="openIdTxt" value="openIdTest">
<label>DealerId</label>
<input type="text" data-clear-btn="true" id="dealerIdTxt" value="dealerIdTest">
<label>OrderId</label>
<input type="text" data-clear-btn="true" id="orderIdTxt" value="orderIdTest">
</form>
</div>
-->
<div id="msgContent"
style="width: 100%; height: 200px; float: left; overflow: auto">
<div style="position: relative; padding: 30px">
<ol data-role="listview" id="msglist">
</ol>
</div>
</div>
<div style="width: 33%; float: left">
<ul data-role="listview" data-inset="true">
<li data-role="list-divider">我的座驾</li>
<li data-icon="false"><a href="#" onClick="linkToNiuChe('my_car_01')">行车信息</a></li>
<li data-icon="false"><a href="#" onClick="linkToNiuChe('my_car_02')">驾驶行为</a></li>
<li data-icon="false"><a href="#" onClick="linkToNiuChe('my_car_03')">车辆体检</a></li>
<li data-icon="false"><a href="#" onClick="linkToNiuChe('my_car_04')">车辆位置</a></li>
<li data-icon="false"><a href="#" onClick="linkToNiuChe('my_car_05')">维修历史</a></li>
</ul>
</div>
<div style="width: .5%; float: left">
<p></p>
</div>
<div style="width: 33%; float: left">
<ul data-role="listview" data-inset="true">
<li data-role="list-divider">VIP尊享</li>
<li data-icon="false"><a href="#" onClick="linkToNiuChe('main_introduce')">简介</a></li>
<li data-icon="false"><a href="#" onClick="linkToNiuChe('main_appointment')">预约/查询</a></li>
<li></li>
<li data-icon="false"><a href="javascript:window.location.href=DOMAIN+'/getLoginPage'" >身份验证</a></li>
</ul>
</div>
<div style="width: .5%; float: left">
<p></p>
</div>
<div style="width: 33%; float: left">
<ul data-role="listview" data-inset="true">
<li data-role="list-divider">更多服务</li>
<li data-icon="false"><a href="#" onClick="linkToNiuChe('more_01')">违章信息</a></li>
<li data-icon="false"><a href="#" onClick="linkToNiuChe('more_02')">二手车信息</a></li>
<li data-icon="false"><a href="#" onClick="linkToNiuChe('more_03')">远程打火</a></li>
<li data-icon="false"><a href="#" onClick="linkToNiuChe('more_04')">切换车辆</a></li>
<!-- <li data-icon="false"><a href="#" onClick="linkToNiuChe('more_05')">关于我们</a></li> -->
</ul>
</div>
</div>
</div>
</body>
</html>
|
LOG4J |
|
|
log4j的官方wiki
[url]http://wiki.apache.org/logging-log4j/Log4jXmlFormat[/url]
[code="xml"]
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
<!-- Minimal configuration: one console appender attached to the root logger. -->
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
<!-- Appender named "console" that targets System.out. -->
<appender name="console" class="org.apache.log4j.ConsoleAppender">
<param name="Target" value="System.out"/>
<layout class="org.apache.log4j.PatternLayout">
<param name="ConversionPattern" value="%-5p %c{1} - %m%n"/>
</layout>
</appender>
<!-- Root logger: debug level, output goes to the "console" appender above. -->
<root>
<priority value ="debug" />
<appender-ref ref="console" />
</root>
</log4j:configuration>
[/code]
这是官网给出的一个最基本的配置
[code="xml"]
<root>
<priority value="debug" /><!-- Log level: emits debug and above (debug, info, warn, error, fatal) -->
<appender-ref ref="CONSOLE" /><!-- Output to the console -->
<appender-ref ref="FILE" /><!-- Output to a file -->
</root>
[/code]
这里面配置了 当前项目的全局日志输出情况,以两种形式输出,控制台和文件
[color=red]---------记录日志的多种输出方式 start---------[/color]
这里面的CONSOLE和FILE,是引用,分别引用下面的代码
[code="xml"]<appender name="CONSOLE" class="org.apache.log4j.ConsoleAppender"><!-- Console output -->
<layout class="org.apache.log4j.PatternLayout"><!-- Layout used to format the output -->
<param name="ConversionPattern" value="%d - %c -%-4r [%t] %-5p %x - %m%n" /><!-- Output format -->
</layout>
<!-- Restrict the output level: LevelMin and LevelMax are both ERROR here -->
<filter class="org.apache.log4j.varia.LevelRangeFilter">
<param name="LevelMax" value="ERROR"/>
<param name="LevelMin" value="ERROR"/>
</filter>
</appender>[/code]
这里面配置了在控制台输出的具体配置
使用了ConsoleAppender,另外布局使用了PatternLayout
filter的意思就是使用了过滤,将这一块的内容再通过过滤器来过滤一遍,分级别高低,在这区间内的可以显示出来
(注意这里面参数的大小写)
还有其他的输出,比如通过JDBC输出到数据库、通过SMTP发送邮件,还有下面一段代码输出到File文件等
布局模式也有多种,比如输出成html网页形式等,具体可以参看官网介绍
关于输出格式
%p 输出优先级,即DEBUG,INFO,WARN,ERROR,FATAL
%r 输出自应用启动到输出该log信息耗费的毫秒数
%c 输出所属的类目,通常就是所在类的全名
%t 输出产生该日志事件的线程名
%n 输出一个回车换行符,Windows平台为“\r\n”,Unix平台为“\n”
%d 输出日志时间点的日期或时间,默认格式为ISO8601,也可以在其后指定格式,比如:%d{yyyy MMM dd HH:mm:ss,SSS},输出类似:2002年10月18日 22:10:28,921
%l 输出日志事件的发生位置,包括类目名、发生的线程,以及在代码中的行数。举例:TestLog4.main(TestLog4.java:10)
下面是输出到文件的配置
[code="xml"]
<appender name="FILE" class="org.apache.log4j.FileAppender"><!-- File output -->
<param name="File" value="C:/log4j1.log"/><!-- Path of the log file -->
<layout class="org.apache.log4j.PatternLayout">
<!-- Same conversion pattern as the CONSOLE appender -->
<param name="ConversionPattern"
value="%d - %c -%-4r [%t] %-5p %x - %m%n" />
</layout>
</appender>
[/code]
[code="xml"]
<!-- ========================== Appender overview ================================ -->
<!-- Log4j provides the following appenders: -->
<!-- org.apache.log4j.ConsoleAppender (console), -->
<!-- org.apache.log4j.FileAppender (file), -->
<!-- org.apache.log4j.DailyRollingFileAppender (one log file per day), -->
<!-- org.apache.log4j.RollingFileAppender (starts a new file once the file reaches a given size), -->
<!-- org.apache.log4j.WriterAppender (streams log output to any given destination) -->
<!-- ========================================================================== -->
<!-- Output to a log file -->
<appender name="filelog_appender"
class="org.apache.log4j.RollingFileAppender">
<!-- File: name of the log output file -->
<param name="File" value="log/testlog4jxml_all.log" />
<!-- Append: whether new entries are added to the existing log after a restart -->
<param name="Append" value="true" />
<!-- MaxFileSize: size at which the file is rolled over -->
<param name="MaxFileSize" value="1MB" />
<!-- MaxBackupIndex: number of backup files to keep -->
<param name="MaxBackupIndex" value="10000" />
<!-- Fields and format of each log line -->
<layout class="org.apache.log4j.PatternLayout">
<param name="ConversionPattern" value="%d{yyyy-MM-dd HH:mm:ss} %-5p (%c:%L)- %m%n" />
</layout>
</appender>
<!-- Output to a log file, one file per day -->
<appender name="filelog_daily" class="org.apache.log4j.DailyRollingFileAppender">
<param name="File" value="log/daily.log" />
<!-- NOTE(review): the formatted DatePattern is appended to the File name on rollover; confirm the resulting file names are the intended ones -->
<param name="DatePattern" value="'daily.'yyyy-MM-dd'.log'" />
<layout class="org.apache.log4j.PatternLayout">
<!-- The date specifier must be closed with a plain '}' (no backslash) -->
<param name="ConversionPattern" value="[%d{yyyy-MM-dd HH:mm:ss} %-5p] [%t] (%c:%L) - %m%n" />
</layout>
</appender>
[/code]
以上分别是记录到文件的几种方式
http://www.cnblogs.com/tqsummer/archive/2010/08/26/1809232.html
以上链接则介绍了通过配置SMTP来发送日志的方式
[code="xml"]
<!-- Writes each log event to a database table through JDBC -->
<!-- NOTE(review): connection credentials are stored here in plain text -->
<appender name="DATABASE" class="org.apache.log4j.jdbc.JDBCAppender">
<param name="URL" value="jdbc:oracle:thin:@192.168.0.59:1521:oanet"/>
<param name="driver" value="oracle.jdbc.driver.OracleDriver"/>
<param name="user" value="hdczoa"/>
<param name="password" value="system"/>
<layout class="org.apache.log4j.PatternLayout">
<!-- The layout output is executed as the SQL statement, so the message must be a quoted string literal like the other columns -->
<!-- NOTE(review): a message containing a single quote will still break the statement; the pattern performs no escaping -->
<param name="ConversionPattern"
value="INSERT INTO hdczoa.LOG4J(stamp,thread, info_level,class,message) VALUES ('%d', '%t', '%p', '%c', '%m')" />
</layout>
</appender>
[/code]
以上代码则记录了通过配置JDBC来实现日志插入数据库
[color=red]---------记录日志的多种输出方式 end---------[/color]
关于category 和logger
[code="xml"]
<!-- Logger for the com.abc package: WARN level, output to the CONSOLE appender -->
<!-- additivity="false": events are not also passed on to the appenders of ancestor loggers -->
<logger name="com.abc" additivity="false">
<level value="WARN" />
<appender-ref ref="CONSOLE" />
</logger>
[/code]
这段代码是针对指定的包来设置日志输出情况的
其中的appender-ref同上
而category是和logger几乎相同的,logger继承于category,现在已经不提倡使用category了
log4j的配置大概也就这些,欢迎补充
|
html edge code 配色 |
|
|
标签名 69 103 140
属性名 122 179 108
值 30 155 152
文字 0 2 178
注释 222 198 165
|
sublime |
|
|
obsidian
eclipse color theme
|
jquery.easy-pie-chart |
|
|
jquery.easy-pie-chart
http://developer.51cto.com/art/201208/351584.htm
http://www.elated.com/articles/snazzy-animated-pie-chart-html5-jquery/
|
mac hotkey |
|
|
option + space == spotlight
fn + left & fn + right == home & end
command + shift + F == full screen
option + W == Launchpad
control + F2 == menubar
|
代码高亮 |
|
|
代码高亮
Syntax Highlighter
CodeMirror
https://code.google.com/p/as3syntaxhighlight/
https://code.google.com/p/as3-commons/source/browse/trunk/as3-commons-asblocks/src/test/actionscript/org/as3commons/asblocks/impl/CodeMirror.as?r=1284
http://labs.searchcoders.com/text/
http://marijn.haverbeke.nl/codemirror/index.html
|
Trade |
|
|
国内国外 股票、证券、基金
订单交易
订单约定以及失效管理
市场商品发行一览及管理
入出金管理
T+0、T+1……营业日交易一览
所有证券交易一览
个人及法人账户管理
营业日 手续费等其他小功能管理
|