master

分支 (2)

管理

管理

master

new_branch

crawler-analysis-storage-task
/
爬虫大作业.py

import requests
import time
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import queue
import re
import pandas as pd
import openpyxl
import pymysql    #导入到数据库
import os
import xlwt
import requests
from PIL import Image
from io import BytesIO
import matplotlib.pyplot as plt


#以上的第三方库要自己下

#####################这一部分是爬虫部分#########
header = {
    "User-Agent": UserAgent().chrome
}
img_queue = queue.Queue()
name_queue = queue.Queue()
area_queue = queue.Queue()
type_queue = queue.Queue()
address_queue = queue.Queue()
fangyuan_queue = queue.Queue()
price1_queue = queue.Queue()
price2_queue = queue.Queue()
tel_queue = queue.Queue()

for i in range(1, 14):
    response = requests.get(url=f"https://guilin.newhouse.fang.com/house/s/b9{i}/", headers=header)
    response.encoding = "utf-8"
    soup = BeautifulSoup(response.text, "html.parser")
    h1 = soup.find_all("div", attrs={"class": "clearfix"})

    for h2 in h1:
        # 照片
        h2_img = h2.find_all("img", attrs={"width": "168", "height": "112"})
        for img in h2_img:
            url_img = "https:" + img['src']
            img_queue.put(url_img)

        # 房屋名称
        h2_house = h2.find_all("div", attrs={"class": "nlcd_name"})
        for temp in h2_house:
            h2_house_name = temp.find("a")
            name = h2_house_name.string.strip()
            name_queue.put(name)

        # 房屋面积
        h2_house_area = h2.find_all("div", attrs={"class": "house_type"})
        for temp in h2_house_area:
            area = temp.contents[-1].strip()
            area_queue.put(area)

            h2_house_type = temp.find_all("a")
            house_type = ""
            for temp in h2_house_type:
                house_type += temp.string.strip() + "/"
            type_queue.put(house_type)

        # 房屋地址
        h2_house_address1 = h2.find_all("div", attrs={"class": "address"})
        for temp in h2_house_address1:
            temp1 = temp.find_all("a")
            address_parts = []
            for temp2 in temp1:
                address_parts.append(temp2.text.strip())
            address = " ".join(address_parts).split()
            address = "".join(address)
            address_queue.put(address)

        # 房源信息
        h2_fangyuan = h2.find_all("div", attrs={"class": "fangyuan"})
        for temp in h2_fangyuan:
            text = temp.text.replace(" ", "/").replace("\n", "/").replace("\r", "/")
            words = text.split()
            joined_text = ''.join(words)
            joined_text = joined_text.replace("∨", "")  # 删除字母"v"
            joined_text = re.sub(r"/+", "/", joined_text)  # 将多个斜杠替换为一个斜杠
            joined_text = re.sub(r"\s+", "", joined_text)  # 删除空格和换行符
            joined_text = joined_text.strip("/")
            fangyuan_queue.put(joined_text)

        # 联系电话
        h2_tel = h2.find_all("div", attrs={"class": "tel"})
        for temp in h2_tel:
            temp1 = temp.find_all("p")
            for temp2 in temp1:
                tel_queue.put(temp2.text)

        # 房屋价格
        h2_price1 = h2.find_all("div", attrs={"class": "nhouse_price"})
        for temp in h2_price1:
            text = temp.text
            text = text.split()
            price1_queue.put(text[0])

        h2_price2 = h2.find_all("p", attrs={"class": "zj_price"})
        for temp in h2_price2:
            price2_queue.put(temp.text)

    print(f"爬取第{i  }次")
    time.sleep(1)

print("爬取完成")

img_list = list(img_queue.queue)
name_list = list(name_queue.queue)
area_list = list(area_queue.queue)
type_list = list(type_queue.queue)
address_list = list(address_queue.queue)
fangyuan_list = list(fangyuan_queue.queue)
price1_list = list(price1_queue.queue)
price2_list = list(price2_queue.queue)
tel_list = list(tel_queue.queue)

########这里是开始创建excel表##############

# 创建excel表
workbook = xlwt.Workbook(encoding='utf-8')
data_sheet = workbook.add_sheet('房源信息', cell_overwrite_ok=True)

# 定义字体
font = xlwt.Font()
font.name = 'name Times New Roman'
font.height = 20 * 12
font.bold = False

# 设置单元格对齐方式
alignment = xlwt.Alignment()
alignment.horz = 0x02
alignment.vert = 0x00
alignment.wrap = 1

# 设置列宽
data_sheet.col(0).width = 256 * 20
data_sheet.col(0).height = 256 * 40
for i in range(1, 10):
    data_sheet.col(i).width = 256 * 20

# 初始化样式
my_style = xlwt.XFStyle()
my_style.font = font
my_style.alignment = alignment

# 定义表头信息
info = ['图片', '楼盘', '面积', '类型', '地址', '房源', '单价', '总价', '电话']

# 开始向表格输入内容
for i in range(len(info)):
    data_sheet.write(0, i, info[i], my_style)

# 确定列表的最小长度
list_length = min(100, len(name_list), len(area_list), len(type_list), len(address_list),
                  len(fangyuan_list), len(price1_list), len(price2_list), len(tel_list))

# 写入数据
data_lists = [name_list, area_list, type_list, address_list, fangyuan_list, price1_list, price2_list, tel_list]

for col, data_list in enumerate(data_lists, start=1):
    for row in range(list_length):
        data_sheet.write(row+1, col, data_list[row], my_style)

file_path = r"C:\Users\Qhw\Desktop\桂林房地产数据信息.xls"   #存放路径
workbook.save(file_path)

if os.path.exists(file_path):
    print("Excel创建成功")
else:
    print("Excel创建失败")
###############这一部分为导入数据库部分#########
def connect_to_database():
    host = '127.0.0.1'  # 数据库服务器地址 公网；本地的就用127.0.0.1
    user = 'root'      # 数据库用户名
    password = '123mysql'    # 填写自己的数据库密码
    port = 3306     # 默认数据库端口号
    database = 'mydata'       # 这里改为要连接的数据库名称

    try:
        conn = pymysql.connect(
            host=host,
            user=user,
            password=password,
            port=port,
            database=database,
            charset='utf8mb4'
        )
        print("成功连接数据库")
        return conn  # 返回数据库连接对象
    except pymysql.MySQLError as e:
        print(f"连接数据库出现错误：{e}")
        return None

# 将 Excel 数据插入到 MySQL 数据库中
def insert_excel_data_into_mysql(conn, excel_data):
    if conn is not None:
        cursor = conn.cursor()  # 创建游标对象

        try:
            # 准备 SQL 插入语句
            sql = "INSERT INTO 桂林房地产 (图片, 楼盘, 面积, 类型, 地址, 房源, 单价, 总价, 电话) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)"

            # 执行 SQL 语句，插入数据
            for index, row in excel_data.iterrows():
                cursor.execute(sql, (
                    row['图片'], row['楼盘'], row['面积'], row['类型'],
                    row['地址'], row['房源'], row['单价'], row['总价'], row['电话']
                ))

            conn.commit()  # 提交事务
            print("成功插入表格")
        except pymysql.MySQLError as e:
            print(f"插入数据出现错误：{e}")
        finally:
            cursor.close()  # 关闭游标
            conn.close()  # 关闭连接
    else:
        print("未能连接到数据库")

if __name__ == "__main__":
    # 读取 Excel 文件数据并处理 NaN 值
    excel_data = pd.read_excel(r"C:\Users\Qhw\Desktop\桂林房地产数据信息.xls")   #这里是要读取的本地文件
    excel_data.fillna(value='', inplace=True)  # 将 NaN 值替换为空字符串

    # 连接到数据库并执行插入操作
    connection = connect_to_database()
    insert_excel_data_into_mysql(connection, excel_data)


# ########这里是开始进行数据的分析##############

#提取单价和总价中的数值部分便于后续的排序

# 提取数值部分的正则表达式
pattern = r'(\d+(\.\d+)?)'  # 匹配一个或多个数字，包括可能的小数部分
# 提取每个单价字符串中的数值部分，并且转化为小数(单位 元/平方米)
unit_price = []
for string in price1_list:
    match = re.search(pattern, string)
    if match:
        numeric_value = match.group(1)
        unit_price.append(float(numeric_value)) # 将提取的数值部分转换为2位小数


# 提取每个总价字符串中的数值部分，并且转化为小数(单位 万元/套)
total_price = []
for string in price2_list:
    match = re.search(pattern, string)
    if match:
        numeric_value = match.group(1)
        total_price.append(float(numeric_value))

regions_list = []
# 遍历原始字符串数组，提取内容并添加到新数组中,将里面的地区提取出来
for s in address_list:
    start_idx = s.find("[")  # 查找【的起始位置
    end_idx = s.find("]")  # 查找】的起始位置
    if start_idx != -1 and end_idx != -1:
        content = s[start_idx + 1:end_idx]  # 提取中括号内的内容
        regions_list.append(content)

# 打印新数组

#构建字典 字典中包含：key:小区名称 value：小区单价和小区的总价
house_list = {name:(unit_price,total_price,regions) for name,unit_price,total_price,regions in zip(name_list,unit_price,total_price,regions_list)}
unit_price_list = dict(sorted(house_list.items(), key=lambda x: x[1][0], reverse=True))
total_price_list = dict(sorted(house_list.items(), key=lambda x: x[1][1], reverse=True))
print(unit_price_list)
# print(total_price_list)

regions_price_list ={}
regions_number_list = {}
for values in unit_price_list.items():
    # for unit,total,region in values[1]:
    if values[1][2] not in regions_price_list:
        regions_price_list[values[1][2]] = [0,0]
        # regions_number_list[values[1][2]] = 0

    # 将小区的单价加到对应的区中
    regions_price_list[values[1][2]][0] += values[1][0]
    regions_price_list[values[1][2]][1] += 1
    # regions_price_list[values[1][2]] = 1 + regions_price_list[values[1][2]]
regions_average_list = {}
for values in regions_price_list.items():
    if values[0] not in regions_average_list:
        regions_average_list[values[0]] = round(float(values[1][0] / values[1][1]),3)
        if regions_average_list[values[0]] < 1:
            del regions_average_list[values[0]]
print(regions_average_list)
regions_averageSorted_list = dict(sorted(regions_average_list.items(), key=lambda item: item[1]))

# 针对单价和总价进行画图
keys = list(unit_price_list.keys())
unit_values = [value[0] for value in unit_price_list.values()]

#显示中文字符
plt.rcParams['font.sans-serif'] = ['SimHei']  # 'SimHei'是一种常用的中文字体window用这个  macOS用这个Arial Unicode MS
plt.rcParams['axes.unicode_minus'] = False  # 正常显示负号

# 创建水平柱状图
# 计算需要的画布高度
number_of_items = len(unit_price_list)
height_per_item = 0.2  # 每个项目的高度，可以根据需要调整
fig_height = number_of_items * height_per_item

fig1, ax1 = plt.subplots(figsize=(8,  max(5, fig_height)))

ax1.barh(keys, unit_values, color='blue')
ax1.set_title("桂林各个小区单价")
ax1.set_ylabel("小区名称")
ax1.set_xlabel("小区单价（单位：元/平方米）")
fig1.savefig('unit_price_list.png')


total_values = [value[1] for value in unit_price_list.values()]
fig2, ax2 = plt.subplots(figsize=(8,  max(5, fig_height)))
ax2.barh(keys, unit_values, color='blue')
ax2.set_title("桂林各个小区总价")
ax2.set_ylabel("小区名称")
ax2.set_xlabel("小区单价（单位：万元/套）")
fig2.savefig('total_price_list.png')

regions_value =list(regions_averageSorted_list.values())
regions_keys = list(regions_averageSorted_list.keys())
number_of_regions = len(regions_averageSorted_list)
regions_height = number_of_items * height_per_item
fig3, ax3 = plt.subplots(figsize=(8,  max(5, fig_height)))
ax3.barh(regions_keys, regions_value, color='orange')
ax3.set_title("桂林各个区单价情况")
ax3.set_ylabel("区名称")
ax3.set_xlabel("小区单价（单位：元/平方米）")
fig3.savefig('regions_price_list.png')

plt.show()