
A JD.com (Jingdong Mall) Web Crawler Using Cookies and User Agents (Project Download Link Included)

wxchong · 2024-09-06 01:06:17

Project repository:

https://github.com/Glf9832/ScrapyJingDong.git
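
To grab a local copy of the project with git:

git clone https://github.com/Glf9832/ScrapyJingDong.git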

Main code. First, the item definitions (items.py):

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class ScrapyjdItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    product_url = scrapy.Field()       # product URL
    product_id = scrapy.Field()        # product ID
    product_name = scrapy.Field()      # product name
    product_price = scrapy.Field()     # price
    store_name = scrapy.Field()        # store name
    store_url = scrapy.Field()         # store URL
    crawl_time = scrapy.Field()        # crawl timestamp


class CommentItem(scrapy.Item):
    product_url = scrapy.Field()       # product URL
    product_id = scrapy.Field()        # product ID
    comment_count = scrapy.Field()     # number of comments
    comment_pro_type = scrapy.Field()  # product variant the comment refers to
    comment_time = scrapy.Field()      # comment timestamp
    crawl_time = scrapy.Field()        # crawl timestamp

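These item classes are plain field declarations; a spider callback fills them and yields them into the pipeline. A minimal sketch of such a callback (the spider name, the relative import path, and the selector are placeholders for illustration, not the project's actual rules):

import datetime

import scrapy

from ..items import ScrapyjdItem  # import path assumed


class JdSketchSpider(scrapy.Spider):
    # Hypothetical spider for illustration; the real one lives in the repo.
    name = 'jd_sketch'

    def parse(self, response):
        item = ScrapyjdItem()
        item['product_url'] = response.url
        # Placeholder selector; the repo's spider has the real XPath rules.
        item['product_name'] = response.xpath('//title/text()').get()
        item['crawl_time'] = datetime.datetime.now().isoformat()
        yield item

Next, the middleware definitions.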

middlewares.py:

# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

import json
import random

import redis
from scrapy import signals

from .userAgents import USER_AGENTS
from .cookies import init_cookie
# User-Agent middleware
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
# Redirect middleware
from scrapy.downloadermiddlewares.redirect import RedirectMiddleware


class ScrapyjdSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class ScrapyjdDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class UserAgentmiddleware(UserAgentMiddleware):
    # Picks a random User-Agent string from USER_AGENTS for every request.
    def process_request(self, request, spider):
        agent = random.choice(USER_AGENTS)
        # spider.logger.info('agent: %s' % agent)
        request.headers['User-Agent'] = agent

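Note that middlewares.py imports redis and init_cookie from .cookies, but the cookie-handling middleware itself is not reproduced in this post; see the repository for the full version. As a rough idea of the pattern, a downloader middleware that attaches a stored cookie jar to each outgoing request could look like the sketch below. The class name, the Redis key scheme, and the JSON encoding are assumptions for illustration, not code from the repo.

import json
import random

import redis


class CookiesSketchMiddleware(object):
    # Hypothetical cookie-rotating middleware; not the project's actual code.
    def __init__(self):
        # Assumes a local Redis instance holding pre-collected cookie jars,
        # one JSON-encoded dict per key (the key pattern is made up here).
        self.rconn = redis.StrictRedis(host='127.0.0.1', port=6379, db=0)

    def process_request(self, request, spider):
        keys = self.rconn.keys('jd:cookies:*')
        if not keys:
            return None  # no stored cookies; let the request go out bare
        raw = self.rconn.get(random.choice(keys))
        request.cookies = json.loads(raw)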

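For the random User-Agent middleware to take effect, it has to be registered in the project's settings.py, with the stock middleware disabled so the two don't both run. A minimal sketch, assuming the Scrapy project package is named ScrapyJd (check the repository for the actual module path):

# settings.py (sketch; module path assumed)
DOWNLOADER_MIDDLEWARES = {
    # Disable the built-in User-Agent middleware...
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    # ...and enable the custom one that picks a random agent per request.
    'ScrapyJd.middlewares.UserAgentmiddleware': 543,
}

Any priority works for the custom entry as long as the built-in middleware is mapped to None.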