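# Hosting-page banner captured with the code (not part of the script): "Code pull complete; the page will refresh automatically."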
# -*- coding: utf-8 -*-
from datetime import time
import requests
from lxml import html
import re
from pymongo import MongoClient
# import _thread
# Seconds to wait for the listing page before giving up on it.
_REQUEST_TIMEOUT = 10

# Floor-area range such as "80-120㎡".
_SQUARE_RE = re.compile(r"\d+-\d+㎡")

# Numeric runs inside the total-price text.
# NOTE(review): the pattern needs at least two digits per run, and several runs
# are concatenated without a separator (e.g. two numbers become one digit
# string) -- kept as in the original; confirm this is the wanted format.
_TOTAL_PRICE_RE = re.compile(r"\d+\.?\d+")


def _joined_text(node, path):
    """Return every text node matched by *path* under *node*, concatenated."""
    return "".join(node.xpath(path))


def _parse_listing(node):
    """Build the MongoDB document for one ``resblock-desc-wrapper`` element.

    The scraped fields are echoed to stdout as they are extracted.

    Raises:
        IndexError: the main-price span has no text.
        ValueError: the price text is neither numeric nor '价格待定'.
    """
    record = {
        "area1": _joined_text(node, ".//div[@class='resblock-location']/span[1]/text()"),
        "title": _joined_text(node, ".//div[@class='resblock-name']/a/text()"),
        "area": _joined_text(node, ".//div[@class='resblock-location']/span[2]/text()"),
        "detail_place": _joined_text(node, ".//div[@class='resblock-location']/a/text()"),
        "type": _joined_text(node, ".//div[@class='resblock-name']/span[1]/text()"),
    }

    square_text = _joined_text(node, './/div[@class="resblock-area"]//span//text()')
    record['square'] = "".join(_SQUARE_RE.findall(square_text))

    total_text = _joined_text(
        node, './/div[@class="resblock-price"]//div[@class="second"]//text()')
    record['sum_Price'] = "".join(_TOTAL_PRICE_RE.findall(total_text))

    for key in ("area1", "title", "area", "detail_place", "type", "square", "sum_Price"):
        print(record[key])

    price_text = node.xpath(
        './/div[@class="resblock-price"]//div[@class="main-price"]//span[1]//text()')[0].strip()
    if price_text == '价格待定':
        # Listings whose price is still "to be determined" are stored as 0.
        record['price'] = 0.0
        record['sum_Price'] = 0
    else:
        record['price'] = float(price_text)

    if not record['square']:
        # No area range on the page: store 0 rather than an empty string.
        record['square'] = 0
    return record


def get_areas(str):
    """Scrape one listing page and store its listings in MongoDB.

    Fetches ``https://wh.fang.lianjia.com/loupan/<str>``, walks every
    ``ul.resblock-list-wrapper`` -> ``li`` -> ``div.resblock-desc-wrapper``
    element, and inserts one document per listing into the ``loupan``
    collection of the ``lianjia`` database on ``localhost:27017``.

    A listing that cannot be parsed is reported and skipped. A failed HTTP
    request or a failed database write is reported and the page is abandoned.

    Args:
        str: URL path suffix selecting the page, e.g. ``"pg3"``. The name
            shadows the builtin but is kept so existing callers keep working.

    Returns:
        None.
    """
    from pymongo.errors import PyMongoError

    print('start grabing areas')
    headers = {
        # Adjacent literals instead of a backslash continuation, so that
        # source indentation can never leak into the header value.
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/63.0.3239.108 Safari/537.36'}
    url = 'https://wh.fang.lianjia.com/loupan/' + str

    try:
        res = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
        res.raise_for_status()
    except requests.RequestException as exc:
        print(url)
        print('ooops! connecting error: %r' % (exc,))
        return

    context = html.etree.HTML(res.text)

    records = []
    for wrapper in context.xpath('//ul[@class="resblock-list-wrapper"]'):
        items = wrapper.xpath(
            './/li[@class="resblock-list post_ulog_exposure_scroll has-results"]')
        for item in items:
            for desc in item.xpath('.//div[@class="resblock-desc-wrapper"]'):
                try:
                    records.append(_parse_listing(desc))
                except (IndexError, ValueError) as exc:
                    print(url)
                    print('ooops! could not parse a listing, skipping it: %r' % (exc,))

    if not records:
        # insert_many rejects an empty batch, so there is nothing to do.
        return

    try:
        # One client per page, closed on exit, and a single batched insert.
        with MongoClient(host='localhost', port=27017) as client:
            col = client.get_database("lianjia").get_collection("loupan")
            col.insert_many(records)
    except PyMongoError as exc:
        print(url)
        print('ooops! could not store listings: %r' % (exc,))
if __name__ == '__main__':
    # Crawl listing pages 1 through 182, one call to get_areas per page.
    for page_number in range(1, 183):
        print("正在爬取第%d页" % page_number)
        page_suffix = "pg" + str(page_number)
        get_areas(page_suffix)
        print("任务完成")
    print("爬完")
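# Hosting-site notice captured with the code (not part of the script): "This section may contain content unsuitable for display and is hidden. You can review and modify it using the editing features."
# "If you confirm that the content contains no inappropriate language, pure advertising, violence, vulgar or pornographic material, infringement, piracy, false or worthless content, or anything that violates national laws and regulations, you may submit an appeal and we will handle it as soon as possible."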