编程开源技术交流,分享技术与知识

网站首页 > 开源技术 正文

webmagic爬虫抓取京东数据

wxchong 2024-06-10 16:49:30 开源技术 55 ℃ 0 评论

本人从事在线教育(C++),有十年工作经验,现在精心整理了一套从小白到项目实践开发的各种学习资料。如果你想学、想加入我们,请关注我,并在私信回复“编程”,就可以领取学习资料!

个人分类: java

抓取京东店铺商品信息

webmagic使用maven管理

<dependency>
 <groupId>us.codecraft</groupId>
 <artifactId>webmagic-core</artifactId>
 <version>0.7.2</version>
</dependency>
<dependency>
 <groupId>us.codecraft</groupId>
 <artifactId>webmagic-extension</artifactId>
 <version>0.7.2</version>
</dependency>
// Pass in the shop URL.
flag= WebMagic.running(shopUrl);
// Some JD shop product data can be scraped straight from the page; other data
// lives in a JS request whose response has to be analysed.
// Analyse the JS in the browser (F12 developer tools, F5 to reload).
// Build the JS request URL from information found on the page.
public class WebMagic implements PageProcessor{
	static Integer flag ;
	public static Integer running(String url) {
		Spider.create(new WebMagic())
		.addUrl(url)
		.addPipeline(new ConsolePipeline())
		.thread(5).run();	
		return flag;
	}
	public static final String URL_POST ="(http[s]{0,1})://\\w+\\.jd\\.com/view_search-\\S+\\.html";//正则匹配规则
	public static final String URL_ZIYING ="https://list.jd.com/list.html?\\w";// //自营店铺
	public static final String URL_ADVANCE ="(http[s]{0,1})://mall.jd\\.com/\\S";//匹配正则url	 //
	private Site site = Site.me().setRetryTimes(3).setSleepTime(100);
	public Site getSite() {
		return site;
	}
	public void process(Page page) {
		if(page.getUrl().regex(URL_POST).match()){
			//获取商品类目categoryId和appId
			String categoryId="";
			String appId="";
			String orderBy="";
			String direction="";
			String pageSize="";
			String pageNo="";
			String url=page.getUrl().toString();
			String [] sub_url_array = url.split("-"); 
			if (sub_url_array != null && sub_url_array.length >0) {
				
					appId =sub_url_array[1];
					categoryId=sub_url_array[2];
					orderBy=sub_url_array[3];
					direction=sub_url_array[4];
					pageSize =sub_url_array[5];
					//pageNo=sub_url_array[6];
					pageNo=sub_url_array[6].replaceAll(".html", "");
				
			}
			page.putField("pageInstanceId",page.getHtml().xpath("//[@id='pageInstance_id']/@value").all());
			page.putField("venderId",page.getHtml().xpath("//[@id='vender_id']/@value").all());
		
				page.putField("instanceid",page.getHtml().xpath("//div[@class='J_LayoutWrap d-layout-wrap d-enable d-w990']/div/div[2]/div[@class='m_render_structure']/@m_render_instance_id").all());
				page.putField("prototypeid",page.getHtml().xpath("//div[@class='J_LayoutWrap d-layout-wrap d-enable d-w990']/div/div[2]/div[@class='m_render_structure loading']/@m_render_prototype_id").all());
				page.putField("templateId",page.getHtml().xpath("//div[@class='J_LayoutWrap d-layout-wrap d-enable d-w990']/div/div[2]/div[@class='m_render_structure loading']/@m_render_template_id").all());
			
			page.putField("shopId",page.getHtml().xpath("//[@id='shop_id']/@value").all());
			List<String> pageInstanceIds = (List<String>) page.getResultItems().get("pageInstanceId");
			List<String> venderIds = (List<String>) page.getResultItems().get("venderId");
			List<String> instanceids=(List<String>) page.getResultItems().get("instanceid");
			List<String> prototypeids=(List<String>) page.getResultItems().get("prototypeid");
			List<String> templateIds=(List<String>) page.getResultItems().get("templateId");
			List<String> shopIds=(List<String>) page.getResultItems().get("shopId");
			String pageInstanceId="";
			String venderId="";
			String instanceid="";
			String prototypeid="";
			String templateId="";
			String shopId="";
			if (pageInstanceIds != null && pageInstanceIds.size()>0) {
				pageInstanceId=pageInstanceIds.get(0);
				venderId=venderIds.get(0);
				instanceid=instanceids.get(0);
				prototypeid=prototypeids.get(0);
				templateId=templateIds.get(0);
				
				shopId=shopIds.get(0);
			}
			//当前时间戳获取
			String res;
			Date date = new Date();
			long ts = date.getTime();
			res = String.valueOf(ts);
			for (int i = 1; i <5; i++) {
				String surl="";
				
					surl="http://module-jshop.jd.com/module/getModuleHtml.html?appId="+appId+"&orderBy="+orderBy+"&pageNo="+i+"&direction="+direction+"&categoryId="+categoryId+"&pageSize="+pageSize+"&pagePrototypeId=8&pageInstanceId="+pageInstanceId+"&moduleInstanceId="+instanceid+"&prototypeId="+prototypeid+"&templateId="+templateId+"&layoutInstanceId="+instanceid+"&origin=0&shopId="+shopId+"&venderId="+venderId+"&callback=jshop_module_render_callback&_="+res;
				
				JDItemJsonPreocessor.running(surl);
			}
			flag=200; 
		}
	}
	
}
// Fetches product data: price, SKU, name and similar fields.
// Shared static map returned by running(); nothing in this excerpt writes to it.
static Map<String,String> maps = new HashMap<String, String>();
		private PageInfo pages = new PageInfo();
		private ShopItem shopitem;
		private ShopInfo shopinfo;
		private List<ShopInfo> shopInfolist;// shop information
		private List<ShopItem> shopItemlist;
		 
	 /**
	  * Starts a spider for one URL using a new JDItemJsonPreocessor and returns the shared map.
	  *
	  * @param url the URL to add to the spider
	  * @return the static {@code maps} field
	  */
	 public static Map<String,String> running(String url) {
	 	JDItemJsonPreocessor processor = new JDItemJsonPreocessor();
	 	Spider spider = Spider.create(processor);
	 	spider.addUrl(url);
	 	spider.run();
	 	return maps;
	 }
	 // NOTE(review): this is an instance field initializer, so every new JDItemJsonPreocessor
	 // (running() creates one per call) builds its own ClassPathXmlApplicationContext — confirm
	 // whether a single shared context was intended.
	 private ApplicationContext a=new ClassPathXmlApplicationContext("spring/applicationContext-db.xml"); 
	 // Service beans looked up by name from the context above.
	 ShopInfoService shopInfoService=(ShopInfoService) a.getBean("ShopInfoServiceImpl");
	 ShopItemService shopItemService=(ShopItemService) a.getBean("ShopItemServiceImpl");
	 // Regex for module-jshop getModuleHtml.html request URLs.
	 public static final String URL_LIST = "(http[s]{0,1})://module-jshop\\.jd\\.com/module/getModuleHtml\\.html\\?[\\w-_/?&=#%:]*";
	 //public static final String URL_ADVANCE ="(http[s]{0,1})://\\w+\\.jd\\.com/\\S+\\.html";// URL-matching regex
	 // NOTE(review): the '.' in "mall.jd" is unescaped and therefore matches any character.
	 public static final String URL_ADVANCE ="(http[s]{0,1})://mall.jd\\.com/\\S+\\.html";// URL-matching regex
	 // NOTE(review): '.' and '?' are unescaped here, so '?' makes the preceding 'l' optional
	 // rather than matching a literal question mark — confirm intent.
	 public static final String URL_ZIYING ="https://list.jd.com/list.html?\\w";// URL-matching regex
// NOTE(review): these declarations repeat, verbatim, the field declarations that appear
// earlier in this listing.
// Shared static map returned by running(); nothing in this excerpt writes to it.
static Map<String,String> maps = new HashMap<String, String>();
		private PageInfo pages = new PageInfo();
		private ShopItem shopitem;
		private ShopInfo shopinfo;
		private List<ShopInfo> shopInfolist;// shop information
		private List<ShopItem> shopItemlist;
		 
	 /**
	  * Starts a spider for one URL using a new JDItemJsonPreocessor and returns the shared map.
	  *
	  * @param url the URL to add to the spider
	  * @return the static {@code maps} field
	  */
	 public static Map<String,String> running(String url) {
	 	JDItemJsonPreocessor processor = new JDItemJsonPreocessor();
	 	Spider spider = Spider.create(processor);
	 	spider.addUrl(url);
	 	spider.run();
	 	return maps;
	 }
	 // NOTE(review): this is an instance field initializer, so every new JDItemJsonPreocessor
	 // (running() creates one per call) builds its own ClassPathXmlApplicationContext — confirm
	 // whether a single shared context was intended.
	 private ApplicationContext a=new ClassPathXmlApplicationContext("spring/applicationContext-db.xml"); 
	 // Service beans looked up by name from the context above.
	 ShopInfoService shopInfoService=(ShopInfoService) a.getBean("ShopInfoServiceImpl");
	 ShopItemService shopItemService=(ShopItemService) a.getBean("ShopItemServiceImpl");
	 // Regex for module-jshop getModuleHtml.html request URLs.
	 public static final String URL_LIST = "(http[s]{0,1})://module-jshop\\.jd\\.com/module/getModuleHtml\\.html\\?[\\w-_/?&=#%:]*";
	 //public static final String URL_ADVANCE ="(http[s]{0,1})://\\w+\\.jd\\.com/\\S+\\.html";// URL-matching regex
	 // NOTE(review): the '.' in "mall.jd" is unescaped and therefore matches any character.
	 public static final String URL_ADVANCE ="(http[s]{0,1})://mall.jd\\.com/\\S+\\.html";// URL-matching regex
	 // NOTE(review): '.' and '?' are unescaped here, so '?' makes the preceding 'l' optional
	 // rather than matching a literal question mark — confirm intent.
	 public static final String URL_ZIYING ="https://list.jd.com/list.html?\\w";// URL-matching regex
// Excerpt from a page-processing method (its header and the end of the loop are not shown).
// Collect three parallel lists from the module HTML: the "jdprice" attribute (stored as "id"),
// the product name text, and the image "original" attribute.
page.putField("id",page.getHtml().xpath("//div/div/div/div[2]/ul/li/div/div[3]/div[3]/div/span[2]/@jdprice").all());
							page.putField("name",page.getHtml().xpath("//div/div/div/div[2]/ul/li/div/div[3]/div[2]/a/text()").all());
							page.putField("img",page.getHtml().xpath("//div/div/div/ul/li/div/div[1]/a/img/@original").all());
						
	 		 List<String> ids = (List<String>) page.getResultItems().get("id");
		 List<String> name = (List<String>) page.getResultItems().get("name");
		 List<String> imgs=(List<String>) page.getResultItems().get("img");
		 
 	 // Build the price-request URL for all ids, then fetch the id -> price map.
 	 String makerUrl = makerUrl(ids);
	 Map<String, String> running = JDJsonPreocessor.running(makerUrl);// build the price JS request
	 // NOTE(review): the loop is bounded by name.size() but also indexes ids and imgs, so it
	 // assumes all three lists have the same length — confirm.
	 for (int i = 0; i < name.size(); i++) {
	 // Prices are keyed as "J_" + id.
	 String price = running.get("J_"+ids.get(i));
	 	 String ItemId=ids.get(i);
	 	 String productname =name.get(i);
	 	 String pImg="";
	 	 // Prefix the scheme and strip every backslash-quote (\") sequence from the image URL.
	 	 pImg="http:"+imgs.get(i).replaceAll("\\\\\"", "");
 /**
  * Builds the price-lookup URL for the given ids.
  *
  * <p>Each id is prefixed with "J_" and the results are comma-separated in the
  * {@code skuIds} parameter; the current time in milliseconds is appended as the
  * {@code _} parameter.
  *
  * @param ids ids to look up; must not be {@code null}. An empty list yields a URL with an
  *            empty {@code skuIds} value instead of throwing StringIndexOutOfBoundsException.
  * @return the request URL
  */
 public String makerUrl(List<String> ids){
	 	 StringBuilder skuIds = new StringBuilder();
	 	 for (String id : ids) {
	 	 	 // Write the separator before every element except the first, so there is no
	 	 	 // trailing comma to trim (trimming failed on an empty list).
	 	 	 if (skuIds.length() > 0) {
	 	 	 	 skuIds.append(',');
	 	 	 }
	 	 	 skuIds.append("J_").append(id);
	 	 }
	 	 // Current timestamp in milliseconds.
	 	 String timestamp = String.valueOf(System.currentTimeMillis());
	 	 return "http://p.3.cn/prices/mgets?callback=jQuery3944635&skuIds="+skuIds+"&_="+timestamp;
	 	 }
//获取价格信息
package com.huanovo.fxprice.service.impl;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.huanovo.fxprice.util.JsonUtil;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
public class JDJsonPreocessor implements PageProcessor{
	 static Map<String,String> maps = new HashMap<String, String>();
	 
	 
	 public static Map<String,String> running(String url) {
	 Spider.create(new JDJsonPreocessor()).addUrl(url).run();
	 return maps;
	 }
	 private Site site = Site.me()
	 .setRetryTimes(3)
	 .setSleepTime(100)
	 .addHeader("Accept-Encoding", "/")
	 .setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.59 Safari/537.36");
	 public Site getSite() {
	 return site;
	 }
	 public void process(Page page) {
	 page.setSkip(true);
	 String text = page.getRawText();
	 int begin = text.indexOf("[");
	 int end = text.indexOf("]");
	 String substring = text.substring(begin, end + 1);
	 String jsonName = "result";
	 String json = "{\"" + jsonName + "\":" + substring + "}";
	 Map<String, Object> map = JsonUtil.jsonToMap(json);
	 List<Map<String, Object>> list = (List<Map<String, Object>>) map.get(jsonName);
	 for (Map<String, Object> map1 : list) {
	 String key = map1.get("id").toString();
	 String value = map1.get("p").toString();
	 maps.put(key, value);
	 }
	 }
}
主要流程就是:1. 拿到链接,分析页面数据;2. 模拟链接访问;3. 用 xpath 提取页面信息。over

本人从事在线教育(C++),有十年工作经验,现在精心整理了一套从小白到项目实践开发的各种学习资料。如果你想学、想加入我们,请关注我,并在私信回复“编程”,就可以领取学习资料!

Tags:

本文暂时没有评论,来添加一个吧(●'◡'●)

欢迎 发表评论:

最近发表
标签列表