使用 jsoup 和 Nutz Dao(NutDao)实现。
代码如下:
/** Shared Nutz DAO used by the import step; its data source is configured in the static block below. */
static NutDao dao = new NutDao();
static {
    // Keep the connection settings in a private Properties object. Writing them into
    // System.getProperties() would expose the credentials to every piece of code in the JVM
    // and would hand all unrelated system properties to the data source factory.
    // NOTE(review): credentials are hard-coded; consider moving them to external configuration.
    Properties pp = new Properties();
    pp.setProperty("driverClassName", "com.mysql.jdbc.Driver");
    pp.setProperty("url", "jdbc:mysql://localhost:3306/wordpress?useUnicode=true&characterEncoding=utf-8");
    pp.setProperty("username", "root");
    pp.setProperty("password", "000000");
    try {
        DataSource ds = BasicDataSourceFactory.createDataSource(pp);
        dao.setDataSource(ds);
    } catch (Exception e) {
        // Deliberately best effort: the class must still load when the data source cannot be
        // created, because not every method here touches the database. In that case the DAO
        // is left without a data source.
        e.printStackTrace();
    }
}
/**
 * Program entry point. The body currently contains no executable statements: both lines
 * below are commented out, so running the program does nothing.
 *
 * @param args command-line arguments; not read
 * @throws MalformedURLException declared, but nothing in the current body throws it
 * @throws IOException declared, but nothing in the current body throws it
 * @throws InterruptedException declared, but nothing in the current body throws it
 */
public static void main(String[] args) throws MalformedURLException, IOException, InterruptedException {
// Disabled: the step that loads the dump file into the database.
//createSqlFile();
// Disabled: scratch check of String.replace with a "${1}" placeholder.
// System.out.println("你好${1}".replace("${1}", "world"));
}
/**
 * Reads the post dump {@code javaeyepost.txt} and inserts one row into {@code wp_posts}
 * for every complete record.
 *
 * <p>A record consists of a title line, a {@code "date: "} line and a {@code "content: "}
 * line; the content may span several lines and ends at a line equal to
 * {@code post*****over}. Other lines are ignored. The {@code guid} of each inserted row is
 * built from a counter that starts at 25.
 *
 * @throws IOException if the dump file cannot be opened or read
 */
public static void createSqlFile() throws IOException{
    // The dump is written as UTF-8 bytes, so it has to be decoded as UTF-8 rather than with
    // the platform default charset that FileReader would use.
    BufferedReader reader = new BufferedReader(new java.io.InputStreamReader(
            new java.io.FileInputStream(new File("javaeyepost.txt")), "utf-8"));
    try {
        String title = null;
        String date = null;
        int index = 25;
        String line;
        while ((line = reader.readLine()) != null) {
            if (line.startsWith("title:")) {
                // Accept the label with or without a trailing space, since existing dump
                // files may contain either form.
                title = line.substring("title:".length()).trim();
            } else if (line.startsWith("date: ")) {
                System.out.println(line);
                date = line.substring(6) + " 00:00:00";
            } else if (line.startsWith("content: ")) {
                String content = readContent(reader, line.substring(9));
                if (content == null) {
                    // End of file before the terminator line: the record is incomplete and
                    // is dropped.
                    break;
                }
                if (title == null || date == null) {
                    // Without this guard a missing title would surface as a
                    // NullPointerException inside URLEncoder.
                    System.err.println("Skipping post without title or date (next guid index " + index + ")");
                } else {
                    String encode = encodeSlug(title, 200);
                    dao.insert("wp_posts", Chain.make("post_author", 1).add("post_date", date)
                            .add("post_date_gmt", date).add("post_content", content).add("post_title", title)
                            .add("post_status", "publish").add("comment_status", "open").add("ping_status", "open")
                            .add("post_name", encode).add("post_modified", date).add("post_modified_gmt", date)
                            .add("post_parent", 0).add("guid", "http://localhost:89/?p=" + index++).add("menu_order", 0)
                            .add("post_type", "post").add("comment_count", 0).add("post_excerpt", "").add("post_password", "")
                            .add("to_ping", "").add("pinged", "").add("post_content_filtered", "").add("post_mime_type", ""));
                }
                // Do not let this record's fields leak into the next one.
                title = null;
                date = null;
            }
        }
    } finally {
        reader.close();
    }
}

/**
 * Collects the remaining lines of a post body up to the {@code post*****over} terminator.
 *
 * @param reader    reader positioned right after the {@code "content: "} line
 * @param firstLine content text that followed the {@code "content: "} label
 * @return the joined content, or {@code null} if the file ended before the terminator
 * @throws IOException if reading fails
 */
private static String readContent(BufferedReader reader, String firstLine) throws IOException {
    StringBuilder body = new StringBuilder(firstLine);
    String line;
    while ((line = reader.readLine()) != null) {
        if (line.equals("post*****over")) {
            return body.toString();
        }
        // Lines are joined without a separator.
        // NOTE(review): this flattens line breaks inside <pre> blocks - confirm it is intended.
        body.append(line);
    }
    return null;
}

/**
 * URL-encodes a title as UTF-8 and limits the result to {@code maxLength} characters.
 * The limit is applied between code points, so the result never ends in a truncated
 * {@code %XX} escape or in part of a multi-byte character.
 *
 * @param title     text to encode
 * @param maxLength maximum length of the encoded result
 * @return the encoded text, at most {@code maxLength} characters long
 * @throws IOException if the UTF-8 encoding is not supported
 */
private static String encodeSlug(String title, int maxLength) throws IOException {
    StringBuilder slug = new StringBuilder();
    int i = 0;
    while (i < title.length()) {
        int codePoint = title.codePointAt(i);
        String piece = URLEncoder.encode(new String(Character.toChars(codePoint)), "utf-8");
        if (slug.length() + piece.length() > maxLength) {
            break;
        }
        slug.append(piece);
        i += Character.charCount(codePoint);
    }
    return slug.toString();
}
// Detail to watch out for: a post's category may be missing, and the pagination info further down has to be fetched as well.
public static void fetchPost() throws UnsupportedEncodingException, IOException, InterruptedException{
//Document document = Jsoup.parse(new URL("http://feiyan35488.iteye.com/?show_full=false"), 5000);
FileOutputStream fos = new FileOutputStream("javaeyepost.txt");
HttpConnection con = (HttpConnection) HttpConnection.connect("http://feiyan35488.iteye.com/?page=9&&show_full=true");
con.userAgent("Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.12 (KHTML, like Gecko) Chrome/9.0.576.0 Safari/534.12");
Document document = con.get();
Elements es = document.getElementsByClass("blog_main");
System.out.println("共有帖子 :"+es.size());
for(Element e : es){
fos.write(("title:"+e.child(0).child(2).child(0).html()+"\n").getBytes("utf-8"));
if(e.child(0).children().size()>3)
fos.write(("classify: "+e.child(0).child(3).child(0).html()+"\n").getBytes("utf-8"));
fos.write(("date: "+e.child(0).child(0).child(0).html()+"-"+e.child(0).child(0).child(2).html()+"-"+e.child(0).child(0).child(4).html()+"\n").getBytes("utf-8"));
fos.write(("content: "+e.child(1).html()+"\n").getBytes("utf-8"));
fos.write(("post*****over\n").getBytes("utf-8"));
//System.out.println("标题:"+e.child(0).child(2).child(0).html());
}
for(int i=2;i<10;i++){
Thread.sleep(5000);
con.url("http://feiyan35488.iteye.com/?page="+i+"&&show_full=true");
document = con.get();
Elements es1 = document.getElementsByClass("blog_main");
System.out.println("共有帖子 :"+es1.size());
for(Element e : es1){
fos.write(("title:"+e.child(0).child(2).child(0).html()+"\n").getBytes("utf-8"));
fos.write(("classify: "+e.child(0).child(3).child(0).html()+"\n").getBytes("utf-8"));
fos.write(("date: "+e.child(0).child(0).child(0).html()+"-"+e.child(0).child(0).child(2).html()+"-"+e.child(0).child(0).child(4).html()+"\n").getBytes("utf-8"));
fos.write(("content: "+e.child(1).html()+"\n").getBytes("utf-8"));
fos.write(("post*****over\n").getBytes("utf-8"));
//System.out.println("标题:"+e.child(0).child(2).child(0).html());
}
}
}两个方法,一 抓取帖子存到文件中,二,从文件中读取保存到数据库中