【Python】Scraping Apartment Listings from Tujia.com

0. Scraping Scope

  • Cities nationwide

1. Program Structure

  1. URL manager (module)
  2. HTML parser (module)
  3. HTML downloader (module)
  4. Crawler scheduler (main program)
  5. Storage (module)

1.1 Data Fields Collected (final implementation)

  • Title
  • Price
  • Location
  • Rating
  • Recommendation tags
  • Payment tags
  • Brief description

2. How the Data Is Collected

Sample detail page: https://www.tujia.com/detail/220623.htm

Each field, the XPath used to extract it, and an example value from the sample page:

  • Title
    XPath: string(//div[@class='room-info']/h1)
    Example: One-bedroom at Xinghu Mansion, Yueliangwan Station on Metro Line 2, Xinghu Street

  • Price
    XPath: none (later testing showed it cannot be obtained via XPath)
    Example: 358

  • Rating
    XPath: string(//div[@class='results-comments']/span[@class='hotel-value'])
    Example: 5

  • Payment tags
    XPath: normalize-space(string(//ul[@class='pay-tag']))
    Example: Deposit-free: pass credit verification and, once qualified, stay deposit-free in listings that support credit-based deposit waivers

  • Description
    XPath: normalize-space(string(//div[@class='unit-cont intro-cont line-dashed']/div[@class='content-box']/div[@class='desc-info']))
    Example: Located in Suzhou's Dushu Lake Science and Education Innovation District, north of Chuangyuan Road at Yueliangwan and west of Xinghu Street, with direct access to Metro Line 2····

  • House ID
    XPath: none (later testing showed it cannot be obtained)
    Example: (House ID: 220623)

  • Location
    XPath: string(//div[@class='district-info']/span)
    Example: Xinghu Mansion, Yueliangwan Road

  • Recommendation tags
    XPath: //div[@class='hotel-advan-tag']/ul/li/span/text()
    Example: Serviced apartment, 1 bedroom 1 bath, 1 bed, sleeps 2, private bathroom

  • Other recommendations
    XPath: none (cannot be obtained directly via XPath)
    Example: https://www.tujia.com/detail/326278.htm, ….
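
Before wiring these expressions into the parser, each one can be checked directly with lxml against a downloaded copy of the sample page. A minimal sketch (the expressions are the ones from the list above):

#-*- encoding:utf-8 -*-
import urllib2
from lxml import etree

# fetch the sample detail page and evaluate a few of the XPaths above
url = "https://www.tujia.com/detail/220623.htm"
req = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
html = etree.HTML(urllib2.urlopen(req).read())

# string(...) expressions evaluate to a single text value
print html.xpath("string(//div[@class='room-info']/h1)")        # title
print html.xpath("string(//div[@class='district-info']/span)")  # location
# text() expressions evaluate to a list of strings
print html.xpath("//div[@class='hotel-advan-tag']/ul/li/span/text()")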

Plans never keep up with change: it turned out that parts of the detail page are loaded dynamically, so the price has to be fetched by sending a second request.
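
That price request can be reproduced on its own before folding it into the parser (see get_price() in section 3.5). A sketch of the idea, with the endpoint, payload shape, and the sample unitId 40871 taken from the code below; the dates are the hardcoded ones used during collection:

#-*- encoding:utf-8 -*-
import urllib2
import json

# the detail page fills in its price via this XHR endpoint: POST a unit id
# and a date range, then read productPrice out of the JSON response
url = "https://www.tujia.com/bingo/pc/product/getProducts"
payload = '{"checkInDate":"2018-02-09","checkOutDate":"2018-02-10","unitId":"40871","activityInfo":null,"callCenter":false}'
header = {'Content-Type': 'application/json;charset=utf-8',
          'User-Agent': 'Mozilla/5.0'}
req = urllib2.Request(url=url, data=payload, headers=header)
resp = json.loads(urllib2.urlopen(req).read())
print resp["data"]["products"][0]["productPrice"]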

3. The Code

3.1 Crawler Scheduler

#-*- encoding:utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
import URL_Manage
import HTML_Get_info
import HTML_Download
import time
import Save_Data_to_File
class SMan(object):
    def __init__(self):
        self.Manage=URL_Manage.UrlManage()
        self.Download=HTML_Download.HTMLDownloader()
        self.parser=HTML_Get_info.HTML_get_info()
        self.output=Save_Data_to_File.DataOutPut()
    def run(self,rooturl,page):
        # seed the queue with every list page of the city
        self.Manage.add_new_url(rooturl)
        for i in range(1,page):
            new="https://www.tujia.com/gongyu/suzhou/%d/" %i
            self.Manage.add_new_url(new)
            print new
        self.output.create_csv()
        while(self.Manage.has_new_urls() and self.Manage.old_urls_size()<200000):
            try:
                new_url=self.Manage.get_new_url()
                if 'detail' in new_url:
                    # detail page: extract the fields and append them to the CSV
                    html=self.Download.download(new_url)
                    data=self.parser.get_detail(html)
                    self.output.output(data)
                    print "detail page"
                else:
                    # list page: collect the detail-page links it contains
                    html=self.Download.download(new_url)
                    new_urls=self.parser.parser_the_list(html)
                    self.Manage.add_new_urls(new_urls)
                    time.sleep(10)

                print "crawled %s so far" %self.Manage.old_urls_size()
            except Exception as e:
                print "run failed: %s" %e
if __name__=="__main__":
    sman=SMan()
    sman.run("https://www.tujia.com/gongyu/suzhou/",465)

3.2 URL Manager

class UrlManage(object):
    def __init__(self):
        self.new_urls=set()
        self.old_urls=set()
    def new_urls_size(self):
        return len(self.new_urls)
    def old_urls_size(self):
        return len(self.old_urls)
    def has_new_urls(self):
        return self.new_urls_size()!=0
    def get_new_url(self):
        new_url=self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
    def add_new_url(self,url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self,urls):
        if urls is None or len(urls)==0:
            return
        for url in urls:
            # go through add_new_url so already-crawled URLs are skipped
            self.add_new_url(url)
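
The manager's deduplication can be sanity-checked on its own; a quick usage sketch:

m = UrlManage()
m.add_new_url("https://www.tujia.com/detail/220623.htm")
m.add_new_url("https://www.tujia.com/detail/220623.htm")  # duplicate, ignored
print m.new_urls_size()   # 1
url = m.get_new_url()     # moves the URL into old_urls
m.add_new_url(url)        # already crawled, ignored
print m.has_new_urls()    # False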

3.3 HTML Downloader

import urllib2
import read_ips
import random
class HTMLDownloader(object):
    def download(self,url):
        if url is None:
            return None
        # pick a random proxy from the proxy list file on every request
        l_ips=read_ips.read_ips("o-ips.txt")
        httpProxyip=random.choice(l_ips)
        httpproxy_handler=urllib2.ProxyHandler(httpProxyip)
        opener=urllib2.build_opener(httpproxy_handler)
        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0"
        header={'User-Agent':user_agent}
        req=urllib2.Request(url=url,headers=header)
        r=opener.open(req)
        if r.code==200:
            return r.read()
        return None
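
Used on its own (assuming a proxy file o-ips.txt like the one in the appendix sits next to the script):

d = HTMLDownloader()
html = d.download("https://www.tujia.com/detail/220623.htm")
if html is not None:
    print len(html)   # size of the raw page in bytes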

3.4 Storage Module

#-*- encoding:utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
import pandas as pd

class DataOutPut(object):
    def output(self,data):
        if data is None:
            return
        names=["标题","价格","位置","评分","房屋编号","推荐标签","支付标签","简要描述"]
        da=[[data["title"],data["price"],data["location"],data["score"],data["house_id"],data["adv_tar"],data["paytag"],data["bewrite"]]]
        df=pd.DataFrame(data=da,columns=names)
        # append one row; the header was written once by create_csv()
        df.to_csv("tujia.csv",index=None,header=False,encoding="gbk",mode="a+")
    def create_csv(self):
        # write an empty frame so the CSV starts with just the header row
        names = ["标题", "价格", "位置", "评分", "房屋编号", "推荐标签", "支付标签", "简要描述"]
        df = pd.DataFrame(columns=names)
        df.to_csv("tujia.csv", columns=names, index=None, encoding="gbk", mode="w")

3.5 HTML Parser

from lxml import etree
import urllib2
import json
class HTML_get_info(object):
    def get_detail(self,content):
        if content is None:
            return
        html=etree.HTML(content)
        data={}
        data['title']=self.get_element_by_xpath(html=html,xpa="string(//div[@class='room-info']/h1)")
        data['price']=self.get_price(content=html)
        data['score']=self.get_element_by_xpath(html=html,xpa="string(//div[@class='results-comments']/span[@class='hotel-value'])")
        data['paytag']=self.get_element_by_xpath(html=html,xpa="normalize-space(string(//ul[@class='pay-tag']))")
        data['bewrite']=self.get_element_by_xpath(html=html,xpa="normalize-space(string(//div[@class='unit-cont intro-cont line-dashed']/div[@class='content-box']/div[@class='desc-info']))")
        data['house_id']=self.get_element_by_xpath(html=html,xpa="string(//div[@class='unitCheckinTips']/div/span)")
        data['location']=self.get_element_by_xpath(html=html,xpa="string(//div[@class='district-info']/span)")
        data['adv_tar']=self.get_element_by_xpath(html=html,xpa="//div[@class='hotel-advan-tag']/ul/li/span/text()")
        if len(data["score"])<1:
            data["score"]=0.0   # listings without reviews have no score element
        return data

    def get_element_by_xpath(self,html,xpa):
        return html.xpath(xpa)

    def parser_the_list(self,content):
        # list pages link each unit through <h3><a class="...detail...">
        html=etree.HTML(content)
        next_urls=self.get_element_by_xpath(html=html,xpa="//h3/a[contains(@class,'detail')]/@href")
        return next_urls

    def get_other(self,content):
        # "other recommendations" come from a JSON endpoint, not the page itself
        unitid=self.get_element_by_xpath(html=content,xpa="//p[@class='link-btn-cont']/@unitid")
        hotelid=self.get_element_by_xpath(html=content,xpa="//p[@class='link-btn-cont']/@hotelid")
        if not unitid or not hotelid:   # xpath returns a list; empty means not found
            return
        data='{"hotelId":%d,"unitId":%d}'%(int(hotelid[0]),int(unitid[0]))

        url="https://www.tujia.com/bingo/pc/unit/getOtherUnits"

        header = {'Accept': 'application/json, text/plain, */*', 'Cache-Control': 'max-age=0',
             'Connection': 'keep-alive', 'Content-Type': 'application/json;charset=utf-8',
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:45.0) Gecko/20100101 Firefox/45.0'}

        req=urllib2.Request(url=url,data=data,headers=header)
        r=urllib2.urlopen(req)
        the_json=r.read()
        data_json=json.loads(the_json)
        Urls=data_json["data"]["hotelUnitsData"]["hotelUnits"]
        next_urls=[]
        for u in Urls:
            next_urls.append(u["unitDetailLink"])
        return next_urls

    def get_price(self,content):
        # the price is loaded dynamically, so POST the unit id and a date range
        # to the product endpoint and read the price from the JSON response
        unitid = self.get_element_by_xpath(html=content, xpa="//p[@class='link-btn-cont']/@unitid")
        hotelid = self.get_element_by_xpath(html=content, xpa="//p[@class='link-btn-cont']/@hotelid")
        if not unitid or not hotelid:
            return
        data = '{"checkInDate":"2018-02-09","checkOutDate":"2018-02-10","unitId":"%s","activityInfo":null,"callCenter":false}'%int(unitid[0])
        url = "https://www.tujia.com/bingo/pc/product/getProducts"

        header = {'Accept': 'application/json, text/plain, */*', 'Cache-Control': 'max-age=0',
                  'Connection': 'keep-alive',
                  'Content-Type': 'application/json;charset=utf-8',
                  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:45.0) Gecko/20100101 Firefox/45.0'}
        req = urllib2.Request(url=url, data=data, headers=header)
        r = urllib2.urlopen(req)
        the_json = r.read()
        if len(the_json)==128:   # a 128-byte body is the endpoint's error response
            return
        data_json = json.loads(the_json)
        price=data_json["data"]["products"][0]["productPrice"]
        return price
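
Chaining the downloader and parser by hand reproduces what the scheduler does for a single detail page (assuming the modules sit in the same directory):

import HTML_Download

d = HTML_Download.HTMLDownloader()
p = HTML_get_info()
html = d.download("https://www.tujia.com/detail/220623.htm")
data = p.get_detail(html)   # dict with title, price, score, ...
print data["title"], data["price"]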

Note: of all these modules, the parser went through the most revisions by far. The HTML downloader picks its proxy at random from the proxy IP list it reads in.

Appendix:

1. Proxy-list reader used by the downloader

import sys
reload(sys)
sys.setdefaultencoding("utf-8")

def read_ips(file_name):
    # one "ip:port" per line -> one urllib2-style proxy dict per line
    N_ips=[]
    f=open(file_name,"r")
    for ip in f.readlines():
        N_ips.append({'http': '%s'%ip.replace("\n","").replace("\r","")})
    f.close()
    return N_ips

2. Sample proxy file read by the proxy reader

118.114.77.47:8080
219.149.46.151:3129
221.231.109.40:3128
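
Given that file, read_ips returns one urllib2-style proxy dict per line:

>>> read_ips("o-ips.txt")
[{'http': '118.114.77.47:8080'}, {'http': '219.149.46.151:3129'}, {'http': '221.231.109.40:3128'}]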

Final Results

  1. Tujia apartment data for Suzhou (Feb 9) download, 6,876 records
  2. Tujia apartment data for Shanghai (Feb 10) download, 8,602 records

Notes

  • All of the code above was typed by hand.
  • Swap the URL in the crawler scheduler to crawl another city (the number passed in is the page number of the last page of that city's apartment listings).
  • All rental prices are base prices (the data payload inside get_price() hardcodes the collection dates to Feb 9 through Feb 10; see the sketch after this list).
  • Reviews were not scraped (fetching reviews requires POSTing a CSV to Tujia's server through an API).
  • The downloadable data is provided for later data analysis.
  • On efficiency: no multithreading or multiprocessing is used yet; throughput is roughly 45 records per minute.
  • Sometimes the price is fetched successfully but a later step raises an exception, and the run simply reports "run failed".
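
Since the stay dates inside get_price() are hardcoded (third note above), one possible tweak is to build the payload at runtime. A sketch, assuming the endpoint accepts any valid upcoming date range:

from datetime import date, timedelta

def build_price_payload(unit_id):
    # check in tomorrow and out the day after, instead of the fixed 2-9/2-10
    checkin = date.today() + timedelta(days=1)
    checkout = checkin + timedelta(days=1)
    return ('{"checkInDate":"%s","checkOutDate":"%s","unitId":"%s",'
            '"activityInfo":null,"callCenter":false}'
            % (checkin.isoformat(), checkout.isoformat(), unit_id))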