加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
爬取数据.py 4.59 KB
一键复制 编辑 原始数据 按行查看 历史
落日不见繁华 提交于 2021-06-14 02:43 . 这是一个py文件
# -*- coding: utf-8 -*-
from datetime import time
import requests
from lxml import html
import re
from pymongo import MongoClient
# import _thread
# Base URL of the Lianjia (Wuhan) new-development listing; a page suffix such
# as "pg3" is appended to it.
_LOUPAN_URL = 'https://wh.fang.lianjia.com/loupan/'

_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/63.0.3239.108 Safari/537.36'}

# Plain-text fields of one listing: (record key, XPath relative to the
# listing's <div class="resblock-desc-wrapper"> element).
_TEXT_FIELDS = (
    ("area1", ".//div[@class='resblock-location']/span[1]/text()"),
    ("title", ".//div[@class='resblock-name']/a/text()"),
    ("area", ".//div[@class='resblock-location']/span[2]/text()"),
    ("detail_place", ".//div[@class='resblock-location']/a/text()"),
    ("type", ".//div[@class='resblock-name']/span[1]/text()"),
)

# Floor-area range such as "80-120㎡".
_SQUARE_RE = re.compile(r"\d+-\d+㎡")
# Integer or decimal number.  The previous pattern (\d+\.?\d+) required at
# least two digits and therefore silently dropped single-digit values.
_NUMBER_RE = re.compile(r"\d+(?:\.\d+)?")


def _joined_text(node, xpath):
    """Return every text node selected by *xpath* under *node*, concatenated."""
    return "".join(node.xpath(xpath))


def _iter_listing_nodes(page):
    """Yield each ``resblock-desc-wrapper`` div found in the result list of *page*."""
    for wrapper in page.xpath('//ul[@class="resblock-list-wrapper"]'):
        for item in wrapper.xpath(
                './/li[@class="resblock-list post_ulog_exposure_scroll has-results"]'):
            for desc in item.xpath('.//div[@class="resblock-desc-wrapper"]'):
                yield desc


def _parse_listing(node):
    """Build the MongoDB document for one listing element.

    Each extracted text field is echoed to stdout as it is read.

    Raises:
        ValueError: if the listing has no main-price text or the price is
            neither a number nor the "价格待定" (price pending) marker.
    """
    record = {}
    for key, xpath in _TEXT_FIELDS:
        record[key] = _joined_text(node, xpath)
        print(record[key])

    area_text = _joined_text(node, './/div[@class="resblock-area"]//span//text()')
    # Stays "" when the text contains no "<min>-<max>㎡" range.
    record['square'] = "".join(_SQUARE_RE.findall(area_text))
    print(record['square'])

    total_text = _joined_text(
        node, './/div[@class="resblock-price"]//div[@class="second"]//text()')
    # All numbers found are concatenated without a separator (unchanged
    # behaviour); "" when the text contains no number.
    record['sum_Price'] = "".join(_NUMBER_RE.findall(total_text))
    print(record['sum_Price'])

    price_texts = node.xpath(
        './/div[@class="resblock-price"]//div[@class="main-price"]//span[1]//text()')
    if not price_texts:
        raise ValueError('listing has no main-price text')
    if price_texts[0] == '价格待定':
        # Listings whose price is still pending are stored with zero prices.
        record['price'] = 0.0
        record['sum_Price'] = 0
    else:
        record['price'] = float(price_texts[0])
    return record


def get_areas(str, timeout=10):
    """Scrape one listing page and store every listing in MongoDB.

    Fetches ``https://wh.fang.lianjia.com/loupan/<str>``, extracts each
    listing and inserts it as one document into collection ``loupan`` of
    database ``lianjia`` on ``localhost:27017``.

    Args:
        str: page suffix appended to the base URL, e.g. ``"pg3"``.  The name
            shadows the builtin inside this function; it is kept so existing
            callers are unaffected.
        timeout: seconds to wait for the HTTP response, so that a stalled
            connection cannot block the crawl forever.

    Returns:
        None.  A listing that cannot be parsed or stored is reported on
        stdout and skipped; the remaining listings are still processed.
    """
    print('start grabing areas')
    url = _LOUPAN_URL + str
    res = requests.get(url, headers=_REQUEST_HEADERS, timeout=timeout)
    page = html.etree.HTML(res.text)

    # One client per page instead of one per listing, and always closed.
    client = MongoClient(host='localhost', port=27017)
    try:
        col = client.get_database("lianjia").get_collection("loupan")
        for node in _iter_listing_nodes(page):
            try:
                col.insert_one(_parse_listing(node))
            except Exception as exc:
                # Best-effort boundary: skip this listing and keep going.
                # Despite the message text, nothing is retried here.
                print(res.text)
                print(url)
                print('ooops! connecting error, retrying.....')
                print(repr(exc))
    finally:
        client.close()
if __name__ == '__main__':
    # Crawl listing pages pg1 .. pg182 one after another.
    # NOTE(review): the original indentation was lost; the per-page
    # "任务完成" message is assumed to belong inside the loop - confirm.
    first_page, stop_page = 1, 183
    for page_no in range(first_page, stop_page):
        print("正在爬取第%d页" % page_no)
        page_suffix = "pg" + str(page_no)
        get_areas(page_suffix)
        print("任务完成")
    print("爬完")
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化