Kaynağa Gözat

测试爬虫

wangmiaomiao 1 yıl önce
ebeveyn
işleme
bf3c0caed3

+ 11 - 0
slibra-admin/pom.xml

@@ -77,6 +77,17 @@
             <scope>test</scope>
         </dependency>
 
+        <dependency>
+            <groupId>us.codecraft</groupId>
+            <artifactId>webmagic-core</artifactId>
+            <version>0.7.5</version>
+        </dependency>
+        <dependency>
+            <groupId>us.codecraft</groupId>
+            <artifactId>webmagic-extension</artifactId>
+            <version>0.7.5</version>
+        </dependency>
+
 
 
 

+ 42 - 0
slibra-admin/src/main/java/com/slibra/web/controller/business/OschinaBlogPageProcessor.java

@@ -0,0 +1,42 @@
+package com.slibra.web.controller.business;
+
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.ResultItems;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.pipeline.ConsolePipeline;
+import us.codecraft.webmagic.pipeline.JsonFilePipeline;
+import us.codecraft.webmagic.pipeline.ResultItemsCollectorPipeline;
+import us.codecraft.webmagic.processor.PageProcessor;
+
+import java.util.List;
+
+/**
+ * WebMagic {@link PageProcessor} that walks the "flashsword" blog on my.oschina.net.
+ *
+ * <p>For every page handed to it, the processor queues the blog-post links it finds
+ * and records the {@code title}, {@code content} and {@code tags} result fields.
+ */
+public class OschinaBlogPageProcessor implements PageProcessor {
+
+    /** Site settings returned to the spider; only the domain is customised. */
+    private Site site = Site.me().setDomain("my.oschina.net");
+
+    /**
+     * Queues follow-up blog-post URLs and stores the extracted fields on the page.
+     *
+     * @param page the downloaded page supplied by the spider
+     */
+    @Override
+    public void process(Page page) {
+        // Follow every link that looks like an individual post of this blog.
+        List<String> blogPostUrls = page.getHtml()
+                .links()
+                .regex("http://my\\.oschina\\.net/flashsword/blog/\\d+")
+                .all();
+        page.addTargetRequests(blogPostUrls);
+
+        String title = page.getHtml()
+                .xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1")
+                .toString();
+        String content = page.getHtml().$("div.content").toString();
+        List<String> tags = page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all();
+
+        page.putField("title", title);
+        page.putField("content", content);
+        page.putField("tags", tags);
+    }
+
+    /**
+     * @return the site settings used for this crawl
+     */
+    @Override
+    public Site getSite() {
+        return site;
+    }
+
+    /**
+     * Crawls the blog starting from its index page, gathers all result items in memory
+     * and prints them to standard output.
+     *
+     * @param args ignored
+     */
+    public static void main(String[] args) {
+        ResultItemsCollectorPipeline collector = new ResultItemsCollectorPipeline();
+
+        Spider spider = Spider.create(new OschinaBlogPageProcessor());
+        spider.addUrl("http://my.oschina.net/flashsword/blog");
+        spider.addPipeline(collector);
+        spider.run();
+
+        List<ResultItems> results = collector.getCollected();
+        System.out.println(results);
+    }
+}

+ 30 - 0
slibra-admin/src/main/java/com/slibra/web/controller/business/Test.java

@@ -0,0 +1,30 @@
+package com.slibra.web.controller.business;
+
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.example.OschinaBlog;
+import us.codecraft.webmagic.model.ConsolePageModelPipeline;
+import us.codecraft.webmagic.model.OOSpider;
+import us.codecraft.webmagic.model.annotation.ExtractBy;
+import us.codecraft.webmagic.model.annotation.TargetUrl;
+
+import java.util.List;
+
+/**
+ * Annotation-driven WebMagic page model for posts of the "flashsword" blog on
+ * my.oschina.net, together with a small launcher that crawls the blog and prints
+ * each extracted model through {@link ConsolePageModelPipeline}.
+ *
+ * <p>Pages whose URL matches the {@link TargetUrl} pattern are mapped onto the
+ * fields below according to their {@link ExtractBy} expressions.
+ */
+@TargetUrl("http://my.oschina.net/flashsword/blog/\\d+")
+public class Test {
+
+    /** Value selected by the {@code //title} expression. */
+    @ExtractBy("//title")
+    private String title;
+
+    /** Value selected by the CSS selector {@code div.BlogContent}. */
+    @ExtractBy(value = "div.BlogContent", type = ExtractBy.Type.Css)
+    private String content;
+
+    /** All tag link texts found under the {@code BlogTags} container. */
+    @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true)
+    private List<String> tags;
+
+    /**
+     * Starts an {@link OOSpider} at the blog index page using this class as the page model.
+     *
+     * @param args ignored
+     */
+    public static void main(String[] args) {
+        // Register this class itself as the model so the annotations declared above are
+        // the ones applied; passing the library's OschinaBlog example would leave them unused.
+        OOSpider.create(Site.me(), new ConsolePageModelPipeline(), Test.class)
+                .addUrl("http://my.oschina.net/flashsword/blog")
+                .run();
+    }
+}