Project repository:
https://github.com/Glf9832/ScrapyJingDong.git
Main code:

items.py:
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class ScrapyjdItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    product_url = scrapy.Field()    # product URL
    product_id = scrapy.Field()     # product ID
    product_name = scrapy.Field()   # product name
    product_price = scrapy.Field()  # price
    store_name = scrapy.Field()     # store name
    store_url = scrapy.Field()      # store URL
    crawl_time = scrapy.Field()     # crawl timestamp


class CommentItem(scrapy.Item):
    product_url = scrapy.Field()       # product URL
    product_id = scrapy.Field()        # product ID
    comment_count = scrapy.Field()     # number of comments
    comment_pro_type = scrapy.Field()  # product variant reviewed
    comment_time = scrapy.Field()      # comment timestamp
    crawl_time = scrapy.Field()        # crawl timestamp
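
A minimal sketch of how a spider callback might fill in ScrapyjdItem. The spider name, start URL, and CSS selectors below are illustrative assumptions rather than code from the repository; JD's real search-result markup changes often and would need to be verified:

import scrapy
from datetime import datetime

from ..items import ScrapyjdItem  # assumes the standard Scrapy project layout


class JdDemoSpider(scrapy.Spider):
    name = 'jd_demo'  # hypothetical spider name
    start_urls = ['https://search.jd.com/Search?keyword=python']

    def parse(self, response):
        # Every selector here is a placeholder guess at JD's list markup.
        for product in response.css('li.gl-item'):
            item = ScrapyjdItem()
            item['product_url'] = response.urljoin(
                product.css('div.p-name a::attr(href)').get(default=''))
            item['product_id'] = product.attrib.get('data-sku')
            item['product_name'] = product.css('div.p-name em::text').get()
            item['product_price'] = product.css('div.p-price i::text').get()
            item['store_name'] = product.css('div.p-shop a::text').get()
            item['store_url'] = product.css('div.p-shop a::attr(href)').get()
            item['crawl_time'] = datetime.now().isoformat()
            yield item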
middlewares.py:

# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

import json
import random

import redis  # redis/json are presumably used by cookie handling not shown in this excerpt
from scrapy import signals

from .userAgents import USER_AGENTS
from .cookies import init_cookie
# User-Agent middleware
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
# Redirect middleware (the retry middleware lives in
# scrapy.downloadermiddlewares.retry, not here)
from scrapy.downloadermiddlewares.redirect import RedirectMiddleware


class ScrapyjdSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class ScrapyjdDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class UserAgentmiddleware(UserAgentMiddleware):
    # Pick a random User-Agent from the USER_AGENTS pool for every
    # outgoing request, so consecutive requests do not share one
    # browser fingerprint. (The deprecated scrapy.log API has been
    # replaced with the spider's own logger.)
    def process_request(self, request, spider):
        agent = random.choice(USER_AGENTS)
        # spider.logger.info('agent: %s' % agent)
        request.headers['User-Agent'] = agent
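
For the random User-Agent middleware to take effect it has to be registered in settings.py. A minimal sketch: the module path ScrapyJD below is an assumption about the project's package name, and Scrapy's built-in UserAgentMiddleware is disabled so it cannot overwrite the randomized header:

# settings.py (excerpt); 'ScrapyJD' is an assumed package name
DOWNLOADER_MIDDLEWARES = {
    # turn off Scrapy's built-in User-Agent middleware
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    # enable the random User-Agent middleware from middlewares.py
    'ScrapyJD.middlewares.UserAgentmiddleware': 543,
}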