爬虫12：BeautifulSoup解析

Spider 2019-01-22

BeautifulSoup解析

依赖于lxml
Anaconda Prompt:conda install beautifulsoup4

使用

from bs4 import BeautifulSoup	导入模块
soup = BeautifulSoup(html, 'lxml')	创建解析对象（第二个参数为解析库）
rList = soup.find_all(条件)	按条件查找节点对象，返回列表

示例代码

from bs4 import BeautifulSoup

html = '''<div class="test1">hello world</div>
<div class="test1">hello universe</div>
<div class="test2">
    <span>熵限与宇宙膨胀</span>
</div>
'''
# Build the parse tree using the lxml backend.
soup = BeautifulSoup(html, 'lxml')

# Every <div class="test1">: print all text contained in the node.
for node in soup.find_all('div', attrs={'class': 'test1'}):
    print(node.get_text())
    # Out: hello world
    # Out: hello universe

# Every <div class="test2">: print only the text of its <span> child.
# (get_text() on the div itself would also include the surrounding
# newlines and indentation present in the markup above.)
for node in soup.find_all('div', attrs={'class': 'test2'}):
    print(node.span.string)
    # Out: 熵限与宇宙膨胀

支持的解析库

库	特点
lxml	速度快　文档容错能力强
html.parser	python标准库　速度一般　容错能力一般
xml	速度快　唯一支持XML的解析器（依赖lxml）

没有安装lxml可以用html.parser

常用方法

命令	解释
find_all()	返回列表　Ex:rList=soup.find_all('div', {'id':'test'})
节点对象.get_text()	获取节点下的所有文本内容，包括子节点
节点对象.string	获得节点下的文本内容（节点只有一个子节点时有效，否则返回None）

链家数据

https://xa.lianjia.com/ershoufang/

import requests
import pymongo
from bs4 import BeautifulSoup

class LianjiaSpider(object):
    """Scrape one Lianjia second-hand-housing page and store it in MongoDB.

    Each listing on the page becomes one document in the ``houseinfo``
    collection of the ``lianjiadb`` database.

    Args:
        url: Listing page to fetch.
        host: MongoDB server host.
        port: MongoDB server port.
    """

    def __init__(self, url='https://bj.lianjia.com/ershoufang/',
                 host='172.40.91.200', port=27017):
        self.url = url
        self.headers = {'User-Agent': 'Mozilla/5.0'}
        self.conn = pymongo.MongoClient(host, port)
        self.db = self.conn['lianjiadb']
        self.myset = self.db['houseinfo']

    def getPage(self):
        """Download ``self.url`` and hand the HTML to ``parsePage``.

        Raises:
            requests.exceptions.RequestException: on connection failure or
                when the server does not answer within the timeout.
        """
        # Without a timeout a stalled server would block the crawl forever.
        res = requests.get(self.url, headers=self.headers, timeout=10)
        res.encoding = 'utf-8'
        self.parsePage(res.text)

    @staticmethod
    def _splitFields(text, count):
        """Split '/'-separated text and return the first ``count`` fields.

        Every field is stripped of surrounding whitespace. Returns ``None``
        when ``text`` is ``None`` or holds fewer than ``count`` fields, so
        the caller can skip a malformed listing instead of crashing.
        """
        if text is None:
            return None
        fields = [part.strip() for part in text.split('/')]
        if len(fields) < count:
            return None
        return fields[:count]

    @staticmethod
    def _nodeText(parent, cssClass):
        """Return the text of the first <div class=cssClass> under ``parent``.

        Returns ``None`` when no such node exists (``find`` yields ``None``).
        """
        node = parent.find('div', attrs={'class': cssClass})
        if node is None:
            return None
        return node.get_text()

    def parsePage(self, html):
        """Extract every listing from ``html`` and insert it into MongoDB.

        Listings that lack one of the expected nodes, or whose info text has
        fewer than three fields, are skipped rather than aborting the page.
        """
        soup = BeautifulSoup(html, 'lxml')
        # One <li class="clear LOGCLICKDATA"> per listing.
        listings = soup.find_all(
            'li', attrs={'class': 'clear LOGCLICKDATA'})
        for item in listings:
            # houseInfo looks like: 德露苑 /2室1厅/71.4平米/南/简装/无电梯
            houseFields = self._splitFields(
                self._nodeText(item, 'houseInfo'), 3)
            # positionInfo looks like: 低楼层(共18层)/2000年建塔楼/五棵松
            positionFields = self._splitFields(
                self._nodeText(item, 'positionInfo'), 3)
            totalPrice = self._nodeText(item, 'totalPrice')
            unitPrice = self._nodeText(item, 'unitPrice')

            if (houseFields is None or positionFields is None
                    or totalPrice is None or unitPrice is None):
                continue

            name, huxing, area = houseFields
            floor, year, address = positionFields
            record = {
                '名称': name,
                '户型': huxing,
                '面积': area,
                '楼层': floor,
                '年份': year,
                '地点': address,
                '总价': totalPrice.strip(),
                '单价': unitPrice.strip(),
            }
            self.myset.insert_one(record)

    def workOn(self):
        """Entry point: fetch the page, parse it and save the results."""
        self.getPage()

if __name__ == '__main__':
    # Run a single crawl when the file is executed as a script.
    LianjiaSpider().workOn()