加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
爬取ip.py 2.11 KB
一键复制 编辑 原始数据 按行查看 历史
wu 提交于 2021-07-28 02:57 . 爬取并搭建简易ip代理池
import parsel
import requests
import time
def chik_ip(proxies_list):
    """Return the proxies from *proxies_list* that can fetch the probe page.

    Each candidate is used as an HTTP proxy for a GET of
    ``http://www.baidu.com`` with a 1 second timeout. A candidate is kept
    when the response status is 200.

    Args:
        proxies_list: iterable of proxy addresses as strings, either
            ``"ip:port"`` or a full ``"scheme://ip:port"`` URL.

    Returns:
        A list with the usable entries, in their original order and in
        their original spelling. An unusable proxy is skipped; it never
        aborts the check of the remaining candidates.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
    }
    can_ip = []
    for proxies in proxies_list:
        # Requests selects a proxy by URL scheme ("http"), not by the
        # literal prefix "http://"; with the wrong key the proxy is ignored
        # and the request goes out directly, so every candidate "works".
        proxy_url = proxies if '://' in proxies else f'http://{proxies}'
        try:
            respon = requests.get(
                url='http://www.baidu.com',
                headers=headers,
                proxies={'http': proxy_url},
                timeout=1,
            )
        except (requests.RequestException, ValueError):
            # A connection error, timeout or malformed address only means
            # this one proxy is unusable; keep testing the others.
            continue
        if respon.status_code == 200:
            print(f"{proxies}可用")
            can_ip.append(proxies)
    return can_ip
def ipinfo():
    """Scrape pages 1-6 of the Kuaidaili free proxy list.

    For every table row found under the element with id ``list`` the IP
    (column 1), port (column 2) and type (column 4) are read, and
    ``"ip:port"`` is collected. Progress, each parsed entry, the final
    list, its length and the elapsed time are printed.

    Returns:
        A list of ``"ip:port"`` strings in page and row order.

    Raises:
        requests.RequestException: if a page cannot be fetched, including
            when the 10 second timeout expires.
    """
    proxies_list = []
    start_time = time.time()
    # The request headers are identical for every page, so build them once.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
    }
    for page in range(1, 7):
        print(f"正在爬取第{page}{'#*#'*30}")
        url = f'https://www.kuaidaili.com/free/inha/{page}/'
        # Without a timeout Requests waits indefinitely on a stalled server.
        res = requests.get(url, headers=headers, timeout=10)
        html = parsel.Selector(res.text)
        rows = html.xpath('//*[@id="list"]/table/tbody/tr')
        time.sleep(0.5)
        for row in rows:
            http_type = row.xpath('./td[4]/text()').extract_first()
            ip_num = row.xpath('./td[1]/text()').extract_first()
            port_num = row.xpath('./td[2]/text()').extract_first()
            if not ip_num or not port_num:
                # extract_first() yields None for a missing cell; joining
                # None with ':' would raise TypeError, so skip the row.
                continue
            proxies_dict = {http_type: ip_num + ':' + port_num}
            print(proxies_dict)
            # Per-row pause kept from the original pacing.
            time.sleep(0.5)
            proxies_list.append(proxies_dict[http_type])
        print(f"第{page}爬取完毕{'*+*'*30}")
    print(proxies_list)
    print('获取到的代理IP数量是:', len(proxies_list), "个")
    end_time = time.time()
    now_time = end_time - start_time
    print(f"用时{now_time}秒")
    return proxies_list
def write_text(can_ip):
    """Write each entry of *can_ip* on its own line to ``./i_p.text``.

    The file is created or truncated on every call.

    Args:
        can_ip: iterable of strings to store. A falsy value (``None``,
            ``False``, an empty list) is treated as "no entries" and
            produces an empty file instead of raising ``TypeError``.
    """
    entries = can_ip or ()
    # The context manager closes the file even if a write fails.
    with open('./i_p.text', 'w', encoding='utf-8') as w:
        w.writelines(item + '\n' for item in entries)
def run():
    """Scrape candidate proxies, check them, and save the result.

    Passes the output of ``ipinfo`` to ``chik_ip`` and hands whatever
    ``chik_ip`` returns to ``write_text``.
    """
    write_text(chik_ip(ipinfo()))
# Entry point: the scrape/check/save pipeline starts as soon as this file is
# executed. NOTE(review): the call is unguarded, so it would also run on import.
run()
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化