Distributed crawling with Scrapy-Redis, a case study: Fang.com (搜房网房天下)

Creating the spider

scrapy startproject soufang

cd soufang

scrapy genspider sfw fang.com

Spider code

# -*- coding: utf-8 -*-
import scrapy
import re
from pa_chong.Scrapy.soufang.soufang.items import NewHouseItem, EsfItem
from scrapy_redis.spiders import RedisSpider
class SfwSpider(RedisSpider):    # inherit from RedisSpider instead of scrapy.Spider
    name = 'sfw'
    allowed_domains = ['fang.com']
    # start_urls = ['http://www.fang.com/SoufunFamily.htm']
    redis_key = 'fang:start_url'    # the start URL no longer comes from start_urls but from this Redis key
    # 'fang:start_url' is just an identifier; the exact name does not matter

    def parse(self, response):  # parse the city list page and build the new-house / second-hand URLs for every city
        trs = response.xpath('//div[@id="c02"]//tr')  # all tr tags
        province = None
        for tr in trs:
            tds = tr.xpath('.//td[not(@class)]')  # td tags under this tr (province and cities)
            province_td = tds[0]                  # td holding the province
            province_text = province_td.xpath('.//text()').get()  # province text
            province_text = re.sub(r'\s', '', province_text)      # strip whitespace
            if province_text:   # non-empty text after stripping means a new province starts on this row
                province = province_text    # otherwise keep using the previous province
            # do not crawl overseas listings
            if province == '其它':
                continue
            city_td = tds[1]                      # td holding the cities
            city_links = city_td.xpath('.//a')    # city links
            for city_link in city_links:          # iterate over every city
                city = city_link.xpath('.//text()').get()     # city name
                city_url = city_link.xpath('.//@href').get()  # city URL
                url_module = city_url.split('.')   # split the city domain, e.g. http://anqing.fang.com
                if 'bj' in url_module[0]:     # Beijing is a special case, handle it separately
                    newhouse_url = 'http://newhouse.fang.com/house/s'
                    esf_url = 'http://esf.fang.com'
                elif len(url_module) == 3:
                    # build the new-house URL, e.g. http://anqing.newhouse.fang.com/house/s
                    newhouse_url = url_module[0] + '.newhouse.fang.com/house/s'
                    # build the second-hand-house URL, e.g. http://anqing.esf.fang.com
                    esf_url = url_module[0] + '.esf.fang.com'
                else:
                    continue    # skip city URLs that do not match the expected pattern
                # print(f'province-city: {province}-{city}')
                # print(f'new-house URL: {newhouse_url}')
                # print(f'second-hand URL: {esf_url}')
                yield scrapy.Request(newhouse_url, callback=self.parse_newhouse, meta={'info': (province, city)})
                yield scrapy.Request(esf_url, callback=self.parse_esf, meta={'info': (province, city)})
                # meta={'info': (province, city)} passes the province and city on to the callback via the response

    def parse_newhouse(self, response):                      # parse new-house listings
        province, city = response.meta.get('info')           # unpack province and city
        lis = response.xpath('//div[contains(@class,"nl_con")]/ul/li')
        # contains() does a substring (fuzzy) match on the attribute value
        for li in lis:
            name = li.xpath('.//div[@class="nlcd_name"]/a/text()').get()
            if name is None:    # skip <li> elements (e.g. ads) that have no project name
                continue
            name = name.strip()
            print(name)
            house_list = li.xpath('.//div[contains(@class,"house_type")]/a/text()').getall()
            house_list = list(map(lambda x: re.sub(r'\s', '', x), house_list))  # strip whitespace
            rooms = list(filter(lambda x: x.endswith('居'), house_list))   # keep only entries ending in 居
            area = ''.join(li.xpath('.//div[contains(@class,"house_type")]/text()').getall())
            # get() would return only the first text node, which is just a separator,
            # so use getall() to collect every text node into a list
            # and ''.join() to turn the list back into a string
            area = re.sub(r'-|\s|/', '', area)  # then drop the meaningless separators
            address = li.xpath('.//div[@class="address"]/a/@title').get()
            district_text = ''.join(li.xpath('.//div[@class="address"]/a//text()').getall())
            district = re.search(r'.*\[(.+)\].*', district_text).group(1)
            sale = li.xpath('.//div[contains(@class,"fangyuan")]/span/text()').get()
            price = ''.join(li.xpath('.//div[@class="nhouse_price"]//text()').getall())
            price = re.sub(r'\s|广告', '', price)
            origin_url = li.xpath('.//div[@class="nlcd_name"]/a/@href').get()
            origin_url = response.urljoin(origin_url)
            item = NewHouseItem(province=province, city=city, name=name, rooms=rooms,
                                area=area, address=address, district=district,
                                sale=sale, price=price, origin_url=origin_url)
            yield item
        next_url = response.xpath('//div[@class="page"]//a[@class="next"]/@href').get()
        if next_url:
            # if the "next page" link exists there are more pages, so yield a Request with this method as its own callback
            yield scrapy.Request(response.urljoin(next_url), callback=self.parse_newhouse, meta={'info': (province, city)})
            # response.urljoin(next_url) joins the relative next_url onto response.url

    def parse_esf(self, response):                          # parse second-hand listings
        province, city = response.meta.get('info')          # unpack province and city
        dls = response.xpath('//div[contains(@class,"shop_list")]/dl')
        for dl in dls:
            name = dl.xpath('.//p[@class="add_shop"]/a/text()').get()
            if name is None:    # skip <dl> elements (e.g. ads) that have no estate name
                continue
            item = EsfItem(province=province, city=city)
            item['name'] = name.strip()
            print(item['name'])
            infos = dl.xpath('.//p[@class="tel_shop"]/text()').getall()
            infos = list(map(lambda x: re.sub(r'\s', '', x), infos))
            # print(infos)
            for info in infos:
                if '厅' in info:
                    item['rooms'] = info
                elif '㎡' in info:
                    item['area'] = info
                elif '层' in info:
                    item['floor'] = info
                elif '向' in info:
                    item['toward'] = info
                elif '年' in info:
                    item['year'] = info
            item['address'] = dl.xpath('.//p[@class="add_shop"]/span/text()').get()
            item['price'] = ''.join(dl.xpath('.//dd[@class="price_right"]/span[1]//text()').getall())
            item['unit'] = ''.join(dl.xpath('.//dd[@class="price_right"]/span[2]//text()').getall())
            item['origin_url'] = response.urljoin(dl.xpath('.//h4[@class="clearfix"]/a/@href').get())
            yield item
        next_url = response.xpath('//div[@class="page_al"]/p[last()-2]/a/@href').get()
        if next_url:
            yield scrapy.Request(response.urljoin(next_url), callback=self.parse_esf, meta={'info': (province, city)})

items.py

# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NewHouseItem(scrapy.Item):              # new houses
    province = scrapy.Field()    # province
    city = scrapy.Field()        # city
    name = scrapy.Field()        # project / estate name
    price = scrapy.Field()       # price
    rooms = scrapy.Field()       # room layouts ('X居'), possibly several (list)
    area = scrapy.Field()        # floor area
    address = scrapy.Field()     # address
    district = scrapy.Field()    # district
    sale = scrapy.Field()        # sale status (on sale or not)
    origin_url = scrapy.Field()  # detail-page URL


class EsfItem(scrapy.Item):                  # second-hand houses
    province = scrapy.Field()    # province
    city = scrapy.Field()        # city
    name = scrapy.Field()        # estate name
    rooms = scrapy.Field()       # rooms and halls (e.g. 3室2厅)
    floor = scrapy.Field()       # floor
    toward = scrapy.Field()      # orientation
    year = scrapy.Field()        # year built
    address = scrapy.Field()     # address
    area = scrapy.Field()        # floor area
    price = scrapy.Field()       # total price
    unit = scrapy.Field()        # unit price (per m²)
    origin_url = scrapy.Field()  # detail-page URL

middlewares.py

# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
import random
class UserAgentDownloadMiddleware:
    # pick a random User-Agent header for every request
    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.60'
    ]

    def process_request(self, request, spider):
        user_agent = random.choice(self.USER_AGENTS)
        request.headers['User-Agent'] = user_agent
        # print(request.headers['User-Agent'])

settings.py

Whether to obey robots.txt: ROBOTSTXT_OBEY = False
Download delay: DOWNLOAD_DELAY
Default request headers: DEFAULT_REQUEST_HEADERS
Downloader middlewares: DOWNLOADER_MIDDLEWARES (enable the UserAgentDownloadMiddleware defined above); a sketch of these settings follows below.
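A minimal sketch of these settings, assuming the default soufang project layout; the delay value is a placeholder and the header contents and middleware priority are simply the Scrapy template defaults, not values taken from the original post:

# settings.py (sketch)
ROBOTSTXT_OBEY = False            # do not obey robots.txt
DOWNLOAD_DELAY = 1                # assumed delay between requests, in seconds
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}
DOWNLOADER_MIDDLEWARES = {
    'soufang.middlewares.UserAgentDownloadMiddleware': 543,
}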

For ITEM_PIPELINES, use the Redis pipeline:

# Scrapy-Redis settings
# make sure requests are stored in and scheduled from Redis
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# make sure all spiders share the same duplicate-filter fingerprints
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# use Redis as the item pipeline
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 300
}
# keep the scrapy-redis queues in Redis instead of clearing them, so crawls can be paused and resumed
SCHEDULER_PERSIST = True
# Redis connection info
REDIS_HOST = '192.168.1.104'
REDIS_PORT = 6379

The Scrapy-Redis settings are boilerplate: copy them as-is and just change REDIS_HOST to the externally reachable IP of the machine running Redis.
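As a quick sanity check (not part of the original post), you can verify from each crawler server that the Redis instance is reachable before starting the spiders:

redis-cli -h 192.168.1.104 -p 6379 ping

A PONG reply means the connection works.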

pipelines.py

Once Redis is configured as the item pipeline, the project's own pipelines.py is no longer used; items go straight into Redis.
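With the default scrapy-redis settings, the serialized items end up in a Redis list named '<spider name>:items', i.e. sfw:items here. The snippet below is a minimal sketch, not from the original post and assuming the redis-py package is installed, of how those items could be drained into a local JSON-lines file on the master:

# read_items.py -- sketch: pull items out of Redis on the master machine
import json
import redis

r = redis.Redis(host='192.168.1.104', port=6379)
with open('items.json', 'w', encoding='utf-8') as f:
    while True:
        raw = r.lpop('sfw:items')      # items are stored as JSON strings in the sfw:items list
        if raw is None:                # list drained
            break
        item = json.loads(raw)
        f.write(json.dumps(item, ensure_ascii=False) + '\n')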

To store the items as local JSON files instead:

# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exporters import JsonLinesItemExporter
from .items import NewHouseItem


class SoufangPipeline(object):
    def __init__(self):
        self.newhouse_fp = open('newhouse.json', 'wb')
        self.esfhouse_fp = open('esfhouse.json', 'wb')
        self.newhouse = JsonLinesItemExporter(self.newhouse_fp, ensure_ascii=False)
        self.esfhouse = JsonLinesItemExporter(self.esfhouse_fp, ensure_ascii=False)

    def process_item(self, item, spider):
        # write each item to the file matching its type instead of duplicating it into both
        if isinstance(item, NewHouseItem):
            self.newhouse.export_item(item)
        else:
            self.esfhouse.export_item(item)
        return item

    def close_spider(self, spider):
        self.newhouse_fp.close()
        self.esfhouse_fp.close()
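If you store locally like this, re-enable the project pipeline in settings.py in place of the Redis one (the module path below assumes the default soufang project layout):

ITEM_PIPELINES = {
    'soufang.pipelines.SoufangPipeline': 300,
}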

Running the distributed crawler

First make sure the Redis server is up and reachable, then start the spider on each crawler server; it will sit idle, waiting for a start URL.
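On each crawler server the spider is started in the usual way from the project directory (the spider name sfw comes from the genspider command above):

cd soufang
scrapy crawl sfw

The spiders block on the empty fang:start_url key until a URL is pushed.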

Finally, push the start URL into Redis; the crawler servers pick it up and start crawling immediately:

redis-cli> lpush fang:start_url http://www.fang.com/SoufunFamily.htm
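Equivalently, the seed URL can be pushed from Python with the redis-py client (a sketch; host and port are the ones from settings.py above):

import redis

r = redis.Redis(host='192.168.1.104', port=6379)
r.lpush('fang:start_url', 'http://www.fang.com/SoufunFamily.htm')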

Original article: https://blog.csdn.net/weixin_43040873/article/details/111082689
