加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
爬虫大作业.py 12.20 KB
一键复制 编辑 原始数据 按行查看 历史
guetqhw 提交于 2024-04-22 14:40 . add 爬虫大作业.py.
import requests
import time
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import queue
import re
import pandas as pd
import openpyxl
import pymysql #导入到数据库
import os
import xlwt
import requests
from PIL import Image
from io import BytesIO
import matplotlib.pyplot as plt
#以上的第三方库要自己下
#####################这一部分是爬虫部分#########
header = {
"User-Agent": UserAgent().chrome
}
img_queue = queue.Queue()
name_queue = queue.Queue()
area_queue = queue.Queue()
type_queue = queue.Queue()
address_queue = queue.Queue()
fangyuan_queue = queue.Queue()
price1_queue = queue.Queue()
price2_queue = queue.Queue()
tel_queue = queue.Queue()
for i in range(1, 14):
response = requests.get(url=f"https://guilin.newhouse.fang.com/house/s/b9{i}/", headers=header)
response.encoding = "utf-8"
soup = BeautifulSoup(response.text, "html.parser")
h1 = soup.find_all("div", attrs={"class": "clearfix"})
for h2 in h1:
# 照片
h2_img = h2.find_all("img", attrs={"width": "168", "height": "112"})
for img in h2_img:
url_img = "https:" + img['src']
img_queue.put(url_img)
# 房屋名称
h2_house = h2.find_all("div", attrs={"class": "nlcd_name"})
for temp in h2_house:
h2_house_name = temp.find("a")
name = h2_house_name.string.strip()
name_queue.put(name)
# 房屋面积
h2_house_area = h2.find_all("div", attrs={"class": "house_type"})
for temp in h2_house_area:
area = temp.contents[-1].strip()
area_queue.put(area)
h2_house_type = temp.find_all("a")
house_type = ""
for temp in h2_house_type:
house_type += temp.string.strip() + "/"
type_queue.put(house_type)
# 房屋地址
h2_house_address1 = h2.find_all("div", attrs={"class": "address"})
for temp in h2_house_address1:
temp1 = temp.find_all("a")
address_parts = []
for temp2 in temp1:
address_parts.append(temp2.text.strip())
address = " ".join(address_parts).split()
address = "".join(address)
address_queue.put(address)
# 房源信息
h2_fangyuan = h2.find_all("div", attrs={"class": "fangyuan"})
for temp in h2_fangyuan:
text = temp.text.replace(" ", "/").replace("\n", "/").replace("\r", "/")
words = text.split()
joined_text = ''.join(words)
joined_text = joined_text.replace("∨", "") # 删除字母"v"
joined_text = re.sub(r"/+", "/", joined_text) # 将多个斜杠替换为一个斜杠
joined_text = re.sub(r"\s+", "", joined_text) # 删除空格和换行符
joined_text = joined_text.strip("/")
fangyuan_queue.put(joined_text)
# 联系电话
h2_tel = h2.find_all("div", attrs={"class": "tel"})
for temp in h2_tel:
temp1 = temp.find_all("p")
for temp2 in temp1:
tel_queue.put(temp2.text)
# 房屋价格
h2_price1 = h2.find_all("div", attrs={"class": "nhouse_price"})
for temp in h2_price1:
text = temp.text
text = text.split()
price1_queue.put(text[0])
h2_price2 = h2.find_all("p", attrs={"class": "zj_price"})
for temp in h2_price2:
price2_queue.put(temp.text)
print(f"爬取第{i }次")
time.sleep(1)
print("爬取完成")
img_list = list(img_queue.queue)
name_list = list(name_queue.queue)
area_list = list(area_queue.queue)
type_list = list(type_queue.queue)
address_list = list(address_queue.queue)
fangyuan_list = list(fangyuan_queue.queue)
price1_list = list(price1_queue.queue)
price2_list = list(price2_queue.queue)
tel_list = list(tel_queue.queue)
########这里是开始创建excel表##############
# 创建excel表
workbook = xlwt.Workbook(encoding='utf-8')
data_sheet = workbook.add_sheet('房源信息', cell_overwrite_ok=True)
# 定义字体
font = xlwt.Font()
font.name = 'name Times New Roman'
font.height = 20 * 12
font.bold = False
# 设置单元格对齐方式
alignment = xlwt.Alignment()
alignment.horz = 0x02
alignment.vert = 0x00
alignment.wrap = 1
# 设置列宽
data_sheet.col(0).width = 256 * 20
data_sheet.col(0).height = 256 * 40
for i in range(1, 10):
data_sheet.col(i).width = 256 * 20
# 初始化样式
my_style = xlwt.XFStyle()
my_style.font = font
my_style.alignment = alignment
# 定义表头信息
info = ['图片', '楼盘', '面积', '类型', '地址', '房源', '单价', '总价', '电话']
# 开始向表格输入内容
for i in range(len(info)):
data_sheet.write(0, i, info[i], my_style)
# 确定列表的最小长度
list_length = min(100, len(name_list), len(area_list), len(type_list), len(address_list),
len(fangyuan_list), len(price1_list), len(price2_list), len(tel_list))
# 写入数据
data_lists = [name_list, area_list, type_list, address_list, fangyuan_list, price1_list, price2_list, tel_list]
for col, data_list in enumerate(data_lists, start=1):
for row in range(list_length):
data_sheet.write(row+1, col, data_list[row], my_style)
file_path = r"C:\Users\Qhw\Desktop\桂林房地产数据信息.xls" #存放路径
workbook.save(file_path)
if os.path.exists(file_path):
print("Excel创建成功")
else:
print("Excel创建失败")
###############这一部分为导入数据库部分#########
def connect_to_database():
host = '127.0.0.1' # 数据库服务器地址 公网;本地的就用127.0.0.1
user = 'root' # 数据库用户名
password = '123mysql' # 填写自己的数据库密码
port = 3306 # 默认数据库端口号
database = 'mydata' # 这里改为要连接的数据库名称
try:
conn = pymysql.connect(
host=host,
user=user,
password=password,
port=port,
database=database,
charset='utf8mb4'
)
print("成功连接数据库")
return conn # 返回数据库连接对象
except pymysql.MySQLError as e:
print(f"连接数据库出现错误:{e}")
return None
# 将 Excel 数据插入到 MySQL 数据库中
def insert_excel_data_into_mysql(conn, excel_data):
if conn is not None:
cursor = conn.cursor() # 创建游标对象
try:
# 准备 SQL 插入语句
sql = "INSERT INTO 桂林房地产 (图片, 楼盘, 面积, 类型, 地址, 房源, 单价, 总价, 电话) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)"
# 执行 SQL 语句,插入数据
for index, row in excel_data.iterrows():
cursor.execute(sql, (
row['图片'], row['楼盘'], row['面积'], row['类型'],
row['地址'], row['房源'], row['单价'], row['总价'], row['电话']
))
conn.commit() # 提交事务
print("成功插入表格")
except pymysql.MySQLError as e:
print(f"插入数据出现错误:{e}")
finally:
cursor.close() # 关闭游标
conn.close() # 关闭连接
else:
print("未能连接到数据库")
if __name__ == "__main__":
# 读取 Excel 文件数据并处理 NaN 值
excel_data = pd.read_excel(r"C:\Users\Qhw\Desktop\桂林房地产数据信息.xls") #这里是要读取的本地文件
excel_data.fillna(value='', inplace=True) # 将 NaN 值替换为空字符串
# 连接到数据库并执行插入操作
connection = connect_to_database()
insert_excel_data_into_mysql(connection, excel_data)
# ########这里是开始进行数据的分析##############
#提取单价和总价中的数值部分便于后续的排序
# 提取数值部分的正则表达式
pattern = r'(\d+(\.\d+)?)' # 匹配一个或多个数字,包括可能的小数部分
# 提取每个单价字符串中的数值部分,并且转化为小数(单位 元/平方米)
unit_price = []
for string in price1_list:
match = re.search(pattern, string)
if match:
numeric_value = match.group(1)
unit_price.append(float(numeric_value)) # 将提取的数值部分转换为2位小数
# 提取每个总价字符串中的数值部分,并且转化为小数(单位 万元/套)
total_price = []
for string in price2_list:
match = re.search(pattern, string)
if match:
numeric_value = match.group(1)
total_price.append(float(numeric_value))
regions_list = []
# 遍历原始字符串数组,提取内容并添加到新数组中,将里面的地区提取出来
for s in address_list:
start_idx = s.find("[") # 查找【的起始位置
end_idx = s.find("]") # 查找】的起始位置
if start_idx != -1 and end_idx != -1:
content = s[start_idx + 1:end_idx] # 提取中括号内的内容
regions_list.append(content)
# 打印新数组
#构建字典 字典中包含:key:小区名称 value:小区单价和小区的总价
house_list = {name:(unit_price,total_price,regions) for name,unit_price,total_price,regions in zip(name_list,unit_price,total_price,regions_list)}
unit_price_list = dict(sorted(house_list.items(), key=lambda x: x[1][0], reverse=True))
total_price_list = dict(sorted(house_list.items(), key=lambda x: x[1][1], reverse=True))
print(unit_price_list)
# print(total_price_list)
regions_price_list ={}
regions_number_list = {}
for values in unit_price_list.items():
# for unit,total,region in values[1]:
if values[1][2] not in regions_price_list:
regions_price_list[values[1][2]] = [0,0]
# regions_number_list[values[1][2]] = 0
# 将小区的单价加到对应的区中
regions_price_list[values[1][2]][0] += values[1][0]
regions_price_list[values[1][2]][1] += 1
# regions_price_list[values[1][2]] = 1 + regions_price_list[values[1][2]]
regions_average_list = {}
for values in regions_price_list.items():
if values[0] not in regions_average_list:
regions_average_list[values[0]] = round(float(values[1][0] / values[1][1]),3)
if regions_average_list[values[0]] < 1:
del regions_average_list[values[0]]
print(regions_average_list)
regions_averageSorted_list = dict(sorted(regions_average_list.items(), key=lambda item: item[1]))
# 针对单价和总价进行画图
keys = list(unit_price_list.keys())
unit_values = [value[0] for value in unit_price_list.values()]
#显示中文字符
plt.rcParams['font.sans-serif'] = ['SimHei'] # 'SimHei'是一种常用的中文字体window用这个 macOS用这个Arial Unicode MS
plt.rcParams['axes.unicode_minus'] = False # 正常显示负号
# 创建水平柱状图
# 计算需要的画布高度
number_of_items = len(unit_price_list)
height_per_item = 0.2 # 每个项目的高度,可以根据需要调整
fig_height = number_of_items * height_per_item
fig1, ax1 = plt.subplots(figsize=(8, max(5, fig_height)))
ax1.barh(keys, unit_values, color='blue')
ax1.set_title("桂林各个小区单价")
ax1.set_ylabel("小区名称")
ax1.set_xlabel("小区单价(单位:元/平方米)")
fig1.savefig('unit_price_list.png')
total_values = [value[1] for value in unit_price_list.values()]
fig2, ax2 = plt.subplots(figsize=(8, max(5, fig_height)))
ax2.barh(keys, unit_values, color='blue')
ax2.set_title("桂林各个小区总价")
ax2.set_ylabel("小区名称")
ax2.set_xlabel("小区单价(单位:万元/套)")
fig2.savefig('total_price_list.png')
regions_value =list(regions_averageSorted_list.values())
regions_keys = list(regions_averageSorted_list.keys())
number_of_regions = len(regions_averageSorted_list)
regions_height = number_of_items * height_per_item
fig3, ax3 = plt.subplots(figsize=(8, max(5, fig_height)))
ax3.barh(regions_keys, regions_value, color='orange')
ax3.set_title("桂林各个区单价情况")
ax3.set_ylabel("区名称")
ax3.set_xlabel("小区单价(单位:元/平方米)")
fig3.savefig('regions_price_list.png')
plt.show()
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化