|
@@ -0,0 +1,42 @@
|
|
|
+package com.slibra.web.controller.business;
|
|
|
+
|
|
|
+import us.codecraft.webmagic.Page;
|
|
|
+import us.codecraft.webmagic.ResultItems;
|
|
|
+import us.codecraft.webmagic.Site;
|
|
|
+import us.codecraft.webmagic.Spider;
|
|
|
+import us.codecraft.webmagic.pipeline.ConsolePipeline;
|
|
|
+import us.codecraft.webmagic.pipeline.JsonFilePipeline;
|
|
|
+import us.codecraft.webmagic.pipeline.ResultItemsCollectorPipeline;
|
|
|
+import us.codecraft.webmagic.processor.PageProcessor;
|
|
|
+
|
|
|
+import java.util.List;
|
|
|
+
|
|
|
+public class OschinaBlogPageProcessor implements PageProcessor {
|
|
|
+
|
|
|
+ private Site site = Site.me().setDomain("my.oschina.net");
|
|
|
+
|
|
|
+ @Override
|
|
|
+ public void process(Page page) {
|
|
|
+ List<String> links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all();
|
|
|
+ page.addTargetRequests(links);
|
|
|
+ page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1").toString());
|
|
|
+ page.putField("content", page.getHtml().$("div.content").toString());
|
|
|
+ page.putField("tags",page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all());
|
|
|
+ }
|
|
|
+
|
|
|
+ @Override
|
|
|
+ public Site getSite() {
|
|
|
+ return site;
|
|
|
+
|
|
|
+ }
|
|
|
+
|
|
|
+ public static void main(String[] args) {
|
|
|
+ ResultItemsCollectorPipeline resultItemsCollectorPipeline = new ResultItemsCollectorPipeline();
|
|
|
+ Spider.create(new OschinaBlogPageProcessor()).addUrl("http://my.oschina.net/flashsword/blog")
|
|
|
+// .addPipeline(new ConsolePipeline()).run();
|
|
|
+ .addPipeline(resultItemsCollectorPipeline).run();
|
|
|
+ List<ResultItems> collected = resultItemsCollectorPipeline.getCollected();
|
|
|
+ System.out.println(collected);
|
|
|
+
|
|
|
+ }
|
|
|
+}
|