0. Crawl Scope
- Cities nationwide
1. Program Structure
- URL manager (module)
- HTML parser (module)
- HTML downloader (module)
- Crawler scheduler (main program)
- Data store (module)
1.1 Data Fields Collected (final implementation)
- Title
- Price
- Location
- Score
- Recommendation tags
- Payment tags
- Brief description
2. How the Data Is Obtained
Sample URL: https://www.tujia.com/detail/220623.htm
| Field | XPath | Example |
| --- | --- | --- |
| Title | string(//div[@class='room-info']/h1) | 星湖街地铁2号线月亮湾站星湖公馆一室 |
| Price | not obtainable via XPath (confirmed by testing) | 358 |
| Score | string(//div[@class='results-comments']/span[@class='hotel-value']) | 5 |
| Payment tags | normalize-space(string(//ul[@class='pay-tag'])) | 免押金 通过信用认证,达标后即可凭信用免押入住支持信用免押的房屋 |
| Description | normalize-space(string(//div[@class='unit-cont intro-cont line-dashed']/div[@class='content-box']/div[@class='desc-info'])) | 位于苏州独墅湖科教创新区月亮湾创苑路北、星湖街西,直通地铁2号线···· |
| House ID | not obtainable via XPath (confirmed by testing) | (房屋编号:220623) |
| Location | string(//div[@class='district-info']/span) | 月亮湾路星湖公馆 |
| Recommendation tags | //div[@class='hotel-advan-tag']/ul/li/span/text() | 酒店式公寓,1室1卫,1张床,宜住2人,独立卫浴 |
| Other recommendations | not obtainable directly via XPath | https://www.tujia.com/detail/326278.htm, … |
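As a quick sanity check, any expression in the table can be evaluated with lxml against a downloaded copy of the sample page. A minimal sketch, assuming the page HTML has already been fetched into page_html:

# -*- encoding:utf-8 -*-
# Minimal sketch: evaluating the table's XPath expressions with lxml.
from lxml import etree

def extract_sample_fields(page_html):
    # page_html: the HTML of https://www.tujia.com/detail/220623.htm (fetch it first).
    html = etree.HTML(page_html)
    # string(...) expressions evaluate to a single text value.
    title = html.xpath("string(//div[@class='room-info']/h1)")
    score = html.xpath("string(//div[@class='results-comments']/span[@class='hotel-value'])")
    # .../text() expressions evaluate to a list of text nodes.
    adv_tags = html.xpath("//div[@class='hotel-advan-tag']/ul/li/span/text()")
    return title, score, adv_tags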
Plans never keep up with change: it turned out that the detail page loads its data dynamically, so the price has to be fetched by sending a second request.
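A minimal sketch of that second request, using the same endpoint and payload shape as the get_price() method in section 3.5 (the unitId value below is only a placeholder; the real one is read off the detail page):

# -*- encoding:utf-8 -*-
import urllib2
import json

# Same endpoint and payload shape as get_price() below; "220623" is a placeholder unitId.
url = "https://www.tujia.com/bingo/pc/product/getProducts"
payload = '{"checkInDate":"2018-02-09","checkOutDate":"2018-02-10","unitId":"220623","activityInfo":null,"callCenter":false}'
req = urllib2.Request(url=url, data=payload,
                      headers={'Content-Type': 'application/json;charset=utf-8'})
resp = json.loads(urllib2.urlopen(req).read())
print resp["data"]["products"][0]["productPrice"]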
3. Straight to the Code
3.1 Crawler Scheduler
#-*- encoding:utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
import time

import URL_Manage
import HTML_Get_info
import HTML_Download
import Save_Data_to_File

class SMan(object):
    def __init__(self):
        self.Manage = URL_Manage.UrlManage()
        self.Download = HTML_Download.HTMLDownloader()
        self.parser = HTML_Get_info.HTML_get_info()
        self.output = Save_Data_to_File.DataOutPut()

    def run(self, rooturl, page):
        # Seed the queue with every list page of the city; `page` is the
        # number of the last list page, so the range must reach page + 1.
        self.Manage.add_new_url(rooturl)
        for i in range(1, page + 1):
            # Build list-page URLs from rooturl so other cities work too.
            new = "%s%d/" % (rooturl, i)
            self.Manage.add_new_url(new)
            print new
        self.output.create_csv()
        while self.Manage.has_new_urls() and self.Manage.old_urls_size() < 200000:
            try:
                new_url = self.Manage.get_new_url()
                if 'detail' in new_url:
                    # Detail page: extract the fields and append them to the CSV.
                    html = self.Download.download(new_url)
                    data = self.parser.get_detail(html)
                    self.output.output(data)
                    print "detail page"
                else:
                    # List page: queue the detail-page links it contains.
                    html = self.Download.download(new_url)
                    new_urls = self.parser.parser_the_list(html)
                    self.Manage.add_new_urls(new_urls)
                time.sleep(10)
                print "crawled %s URLs so far" % self.Manage.old_urls_size()
            except Exception as e:
                print "run failed: %s" % e

if __name__ == "__main__":
    sman = SMan()
    sman.run("https://www.tujia.com/gongyu/suzhou/", 465)
3.2 URL Manager
class UrlManage(object):
    """Tracks URLs waiting to be crawled and URLs already crawled."""
    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    def new_urls_size(self):
        return len(self.new_urls)

    def old_urls_size(self):
        return len(self.old_urls)

    def has_new_urls(self):
        return self.new_urls_size() != 0

    def get_new_url(self):
        # Pop an arbitrary pending URL and mark it as crawled.
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

    def add_new_url(self, url):
        if url is None:
            return
        # Skip URLs that are already pending or already crawled.
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            # Route through add_new_url so the duplicate check applies here too.
            self.add_new_url(url)
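A quick usage sketch of the de-duplication behaviour:

manage = UrlManage()
manage.add_new_url("https://www.tujia.com/detail/220623.htm")
manage.add_new_url("https://www.tujia.com/detail/220623.htm")  # duplicate, ignored
print manage.new_urls_size()        # 1
url = manage.get_new_url()          # moves the URL into old_urls
manage.add_new_url(url)             # already crawled, ignored
print manage.new_urls_size()        # 0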
3.3 HTML Downloader
import random
import urllib2

import read_ips

class HTMLDownloader(object):
    def download(self, url):
        if url is None:
            return None
        # Pick a random proxy from the list loaded from o-ips.txt.
        l_ips = read_ips.read_ips("o-ips.txt")
        httpProxyip = random.choice(l_ips)
        httpproxy_hander = urllib2.ProxyHandler(httpProxyip)
        opener = urllib2.build_opener(httpproxy_hander)
        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0"
        header = {'User-Agent': user_agent}
        req = urllib2.Request(url=url, headers=header)
        r = opener.open(req)
        if r.code == 200:
            return r.read()
        return None
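download() makes only a single attempt, so a dead proxy bubbles up as the scheduler's "run failed". A hedged sketch of a retry wrapper (a hypothetical helper, not part of the original code; it works because download() draws a fresh random proxy on every call):

def download_with_retry(downloader, url, tries=3):
    # Hypothetical helper: try up to `tries` proxies before giving up.
    for _ in range(tries):
        try:
            html = downloader.download(url)  # each call picks a new random proxy
            if html is not None:
                return html
        except Exception:
            pass  # dead proxy or network error; try the next one
    return None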
3.4 Data Store
#-*- encoding:utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
import pandas as pd

class DataOutPut(object):
    def output(self, data):
        if data is None:
            return
        names = ["标题", "价格", "位置", "评分", "房屋编号", "推荐标签", "支付标签", "简要描述"]
        da = [[data["title"], data["price"], data["location"], data["score"],
               data["house_id"], data["adv_tar"], data["paytag"], data["bewrite"]]]
        df = pd.DataFrame(data=da, columns=names)
        # Append one row per listing; the header was already written by create_csv().
        df.to_csv("tujia.csv", index=None, header=False, encoding="gbk", mode="a+")

    def create_csv(self):
        # Write the CSV header once before crawling starts.
        names = ["标题", "价格", "位置", "评分", "房屋编号", "推荐标签", "支付标签", "简要描述"]
        df = pd.DataFrame(columns=names)
        df.to_csv("tujia.csv", columns=names, index=None, encoding="gbk", mode="w")
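Because the file is written with encoding="gbk", it has to be read back the same way for the analysis mentioned at the end. A minimal sketch:

import pandas as pd

# Read the scraped records back for analysis; the encoding must match to_csv's.
df = pd.read_csv("tujia.csv", encoding="gbk")
print df.head()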
3.5 HTML Parser
from lxml import etree
import urllib2
import json

class HTML_get_info(object):
    def get_detail(self, content):
        if content is None:
            return
        html = etree.HTML(content)
        data = {}
        data['title'] = self.get_element_by_xpath(html=html, xpa="string(//div[@class='room-info']/h1)")
        # The price is loaded dynamically, so it comes from a second request.
        data['price'] = self.get_price(content=html)
        data['score'] = self.get_element_by_xpath(html=html, xpa="string(//div[@class='results-comments']/span[@class='hotel-value'])")
        data['paytag'] = self.get_element_by_xpath(html=html, xpa="normalize-space(string(//ul[@class='pay-tag']))")
        data['bewrite'] = self.get_element_by_xpath(html=html, xpa="normalize-space(string(//div[@class='unit-cont intro-cont line-dashed']/div[@class='content-box']/div[@class='desc-info']))")
        data['house_id'] = self.get_element_by_xpath(html=html, xpa="string(//div[@class='unitCheckinTips']/div/span)")
        data['location'] = self.get_element_by_xpath(html=html, xpa="string(//div[@class='district-info']/span)")
        data['adv_tar'] = self.get_element_by_xpath(html=html, xpa="//div[@class='hotel-advan-tag']/ul/li/span/text()")
        if len(data["score"]) < 1:
            # Listings without reviews have no score element; default to 0.0.
            data["score"] = 0.0
        return data

    def get_element_by_xpath(self, html, xpa):
        return html.xpath(xpa)

    def parser_the_list(self, content):
        # Extract the detail-page links from a list page.
        html = etree.HTML(content)
        next_urls = self.get_element_by_xpath(html=html, xpa="//h3/a[contains(@class,'detail')]/@href")
        return next_urls

    def get_other(self, content):
        # "Other recommendations" come from a JSON endpoint, not the page itself.
        unitid = self.get_element_by_xpath(html=content, xpa="//p[@class='link-btn-cont']/@unitid")
        hotelid = self.get_element_by_xpath(html=content, xpa="//p[@class='link-btn-cont']/@hotelid")
        if not unitid or not hotelid:
            # xpath() returns a list, so test for emptiness; both ids are required.
            return
        data = '{"hotelId":%d,"unitId":%d}' % (int(hotelid[0]), int(unitid[0]))
        url = "https://www.tujia.com/bingo/pc/unit/getOtherUnits"
        # Content-Length is computed by urllib2 automatically, so it is omitted here.
        header = {'Accept': 'application/json, text/plain, */*', 'Cache-Control': 'max-age=0',
                  'Connection': 'keep-alive', 'Content-Type': 'application/json;charset=utf-8',
                  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:45.0) Gecko/20100101 Firefox/45.0'}
        req = urllib2.Request(url=url, data=data, headers=header)
        r = urllib2.urlopen(req)
        the_json = r.read()
        data_json = json.loads(the_json)
        Urls = data_json["data"]["hotelUnitsData"]["hotelUnits"]
        next_urls = []
        for u in Urls:
            next_urls.append(u["unitDetailLink"])
        return next_urls

    def get_price(self, content):
        unitid = self.get_element_by_xpath(html=content, xpa="//p[@class='link-btn-cont']/@unitid")
        hotelid = self.get_element_by_xpath(html=content, xpa="//p[@class='link-btn-cont']/@hotelid")
        if not unitid or not hotelid:
            return
        # The check-in/check-out dates are hard-coded, so this is the base price for 2-9 to 2-10.
        data = '{"checkInDate":"2018-02-09","checkOutDate":"2018-02-10","unitId":"%s","activityInfo":null,"callCenter":false}' % int(unitid[0])
        url = "https://www.tujia.com/bingo/pc/product/getProducts"
        header = {'Accept': 'application/json, text/plain, */*', 'Cache-Control': 'max-age=0',
                  'Connection': 'keep-alive',
                  'Content-Type': 'application/json;charset=utf-8',
                  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:45.0) Gecko/20100101 Firefox/45.0'}
        req = urllib2.Request(url=url, data=data, headers=header)
        r = urllib2.urlopen(req)
        the_json = r.read()
        if len(the_json) == 128:
            # A 128-byte body is the error response observed when no product is returned.
            return
        data_json = json.loads(the_json, encoding="utf-8")
        price = data_json["data"]["products"][0]["productPrice"]
        return price
Note: of all these modules, the parser went through by far the most revisions. The HTML downloader picks its proxy at random from the proxy IP list it reads in.
Appendix:
1. Proxy-list reader used by the downloader
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

def read_ips(file_name):
    # Read one "ip:port" per line and wrap each as a urllib2 ProxyHandler dict.
    N_ips = []
    f = open(file_name, "r")
    for ip in f.readlines():
        N_ips.append({'http': '%s' % ip.replace("\n", "").replace("\r", "")})
    f.close()
    return N_ips
2. Sample contents of the proxy file read by the proxy reader
118.114.77.47:8080
219.149.46.151:3129
221.231.109.40:3128
Final Results
- Tujia Suzhou apartment data (2-9) download — 6,876 records
- Tujia Shanghai apartment data (2-10) download — 8,602 records
Notes
- All of the code above was typed out by hand, 100%.
- Swap out the link in the crawler scheduler to crawl other cities (the number passed in is the page number of the last page of that city's apartment listings); see the sketch after this list.
- All rental prices are base prices (the data payload inside the get_price() method is hard-coded by me, with a collection window of 2-9 to 2-10).
- Reviews and the like were not crawled (fetching reviews requires a separate POST request to an API on Tujia's servers).
- The data offered for download is meant for later data analysis.
- Performance: no multithreading or multiprocessing is used yet; throughput is roughly 45 records per minute.
- Sometimes the price is fetched successfully but a later step fails, an exception is raised, and "run failed" is reported.
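The sketch referenced in the notes above: crawling another city only needs a different root link and that city's last page number (the 580 below is a made-up placeholder; check the site for the real value):

if __name__ == "__main__":
    sman = SMan()
    # 580 is a placeholder; use the actual last page number of the city's listings.
    sman.run("https://www.tujia.com/gongyu/shanghai/", 580)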