http.py
高小姐 committed on 2017-02-15 16:21: website crawler tool v1.0
# -*- coding: UTF-8 -*-
# Website crawler (Python 2): mirrors a site's pages and static assets into ./data/
import urllib2
import re
import os, itertools
import urlparse
class http:
    # note: these are class-level attributes, shared by all instances
    contents = ''     # page content
    website = ''      # target URL
    webpath = ''      # local save directory (the site's host name)
    allurl = []       # links queued for crawling
    basefiles = []    # paths of files already generated
    currDomain = ''   # current host domain (scheme + host)
    notFinedUrl = []  # URLs that failed to load (404 etc.)
    def __init__(self, url):
        self.website = url
        _parse = urlparse.urlparse(self.website)
        self.webpath = _parse.netloc
        self.makedirs(_parse.netloc)
        self.currDomain = _parse.scheme + "://" + _parse.netloc  # current domain
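    # Illustrative (hypothetical URL): http("http://www.example.com/news/")
    # sets webpath = "www.example.com" and currDomain = "http://www.example.com".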
    def curl_macth(self, url='', _index=0):
        _url = url
        if _url == '':
            _url = self.website
        exits = ["png", "jpg", "gif", "jpeg"]
        ext = self.get_file_ext(_url)
        print "Action:" + _url
        if exits.count(ext.lower()) == 0:
            # check whether the link can be opened
            htmlContent = self.getHtmtContent(_url)
            if htmlContent != False:
                pattern = re.compile('<a.*?href=[\'\"](.*?)[\'\"].*?>')
                macth = pattern.findall(htmlContent)
                _path = self.get_new_url(_url)
                htmlContent = self.actionAUrl(macth, htmlContent, _path)
                if _path.startswith("/"): _path = _path[1:]
                _pathfile = self.path_join(self.webpath, _path)
                # only generate the file if it does not exist yet
                if self.isFiles(_pathfile) == False:
                    # create the directory when it does not exist
                    if self.isDirs(os.path.split(_pathfile)[0]) == False:
                        self.makedirs(os.path.split(_pathfile)[0])
                    # pull in stylesheets, scripts and images
                    htmlContent = self.macthStaticFile(htmlContent, self.currDomain, _url)
                    # write the page to disk
                    self.makefile(_pathfile, htmlContent)
        else:
            self.loading_img(_url)
        # walk the links collected from the pages crawled so far
        if _index < len(self.allurl):
            curlparse = self.allurl[_index]  # take one link
            subparse = urlparse.urlparse(self.allurl[_index])
            if subparse.netloc == '':
                if curlparse.startswith("/"): curlparse = curlparse[1:]
                curlparse = self.path_join(self.currDomain, curlparse)
            # continue with the next page
            self.curl_macth(curlparse, _index + 1)
    def actionAUrl(self, urlLists, contents, curl):
        if len(urlLists) > 0:
            urlLists.sort(key=lambda x: len(x), reverse=True)  # longest links first
            for index in range(len(urlLists)):
                _cparse = urlparse.urlparse(urlLists[index])
                # skip links that point to another domain
                if _cparse.netloc != '' and self.webpath != _cparse.netloc:
                    continue
                # only follow http/https links
                if _cparse.scheme != '' and (_cparse.scheme != 'http' and _cparse.scheme != "https"):
                    continue
                if _cparse.netloc == '' and _cparse.path == '':
                    continue
                if self.allurl.count(urlLists[index]) > 0:
                    continue
                self.allurl.append(urlLists[index])
                # rewrite the href to a local relative path
                relativeURL = self.getRelpath(curl, self.get_new_url(urlLists[index]), "/")
                # re.escape: URLs may contain regex metacharacters such as "?" and "."
                contents = re.sub('href=[\'\"]' + re.escape(urlLists[index]) + '[\'\"]', "href=\"" + relativeURL + "\"", contents)
        return contents
    # match <link> (css) and <script> (js) references
    # and download the remote files locally
    def macthStaticFile(self, contents, _chttphost, _url):
        csspattern = re.compile('<link.*?href=[\'\"](.*?)[\'\"].*?>')
        cssmacth = csspattern.findall(contents)
        jspattern = re.compile('<script.*?src=[\'\"](.*?)[\'\"].*?>')
        jsmacth = jspattern.findall(contents)
        macth = cssmacth + jsmacth  # merge both match lists
        if len(macth) > 0:
            for index in range(len(macth)):
                newstylepath = self.createStaticFile(_chttphost, macth[index], _url)
                if newstylepath != False:
                    contents = contents.replace(macth[index], newstylepath)
        contents = self.macth_image_file(contents, _chttphost, _url)
        return contents
    # save a static file (css/js) locally
    def createStaticFile(self, _chttphost, staticUrl, _url):
        exits = ["css", "js"]
        _parse = urlparse.urlparse(staticUrl)
        # skip files hosted on another domain
        if _parse.netloc != '' and _parse.netloc != self.webpath: return False
        exit = self.get_file_ext(_parse.path)
        if exits.count(exit.lower()) == 0: return False  # wrong file type
        sfile = os.path.split(_parse.path)
        styleName = sfile[1]
        stylepath = sfile[0]
        stylepath = stylepath.replace("../", "")
        stylepath = stylepath.replace("./", "")
        if stylepath.startswith("/"): stylepath = stylepath[1:]
        _path = self.path_join(self.webpath, stylepath, styleName)
        _rangeurl = staticUrl
        if _parse.netloc == '':
            _rangeurl = self.getRealpath(_chttphost + "/" + _parse.path)
        # create the directory when it does not exist
        if self.isDirs(self.path_join(self.webpath, stylepath)) == False:
            self.makedirs(self.path_join(self.webpath, stylepath))
        # check the target file itself, not just its directory
        if self.isFiles(_path) == False:
            Contents = self.getHtmtContent(_rangeurl)
            if Contents == False: return False
            if Contents != '':
                self.makefile(_path, Contents)
            # fetch images, fonts and nested css referenced inside the css file
            if exit.lower() == "css":
                self.getResouceList(_rangeurl, Contents)
        # return the new relative path
        if _url != "":
            return self.action_resouce_path(self.path_join(stylepath, styleName), _url)
        return False
    # match <img> tags
    # and download the page's images locally
    def macth_image_file(self, contents, _chttphost, _url):
        pattern = re.compile('<img.*?src=[\'\"](.*?)[\'\"].*?>')
        macth = pattern.findall(contents)
        exits = ["png", "jpg", "gif", "jpeg"]
        if len(macth) > 0:
            for index in range(len(macth)):
                imgurl = macth[index]  # remote image URL
                _parse = urlparse.urlparse(imgurl)
                if _parse.netloc != '' and _parse.netloc != self.webpath: continue  # skip other domains
                exit = self.get_file_ext(_parse.path)
                if exits.count(exit.lower()) == 0: continue
                sfile = os.path.split(_parse.path)
                imgName = sfile[1]
                _path = sfile[0]  # remote image directory
                _path = _path.replace("../", "")
                _path = _path.replace("./", "")
                imgpath = self.path_join(self.webpath, _path, imgName)
                if _parse.netloc == '':
                    imgurl = self.getRealpath(_chttphost + "/" + _parse.path)
                # loading_img skips the download if the file already exists
                loadStatsu = self.loading_img(imgurl)
                if loadStatsu == False: continue
                newimgpath = self.action_resouce_path(self.path_join(_path, imgName), _url)  # new relative path
                contents = contents.replace(macth[index], newimgpath)
        return contents
    # fetch a URL and return its body, or False on failure
    def getHtmtContent(self, url):
        if self.notFinedUrl.count(url): return False  # URL already failed before
        try:
            headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
            req = urllib2.Request(url=url, headers=headers)
            page = urllib2.urlopen(req)
            html = page.read()
            return html
        except urllib2.URLError, e:
            self.notFinedUrl.append(url)
            if hasattr(e, 'reason'): print url + ' reason:{0}'.format(e.reason)
            if hasattr(e, 'code'): print url + ' code:{0}'.format(e.code)
            return False
    # create a directory under data/
    def makedirs(self, name):
        path = self.path_join("data", name)
        if os.path.isdir(path) == False:
            os.makedirs(path)
    # download a remote image (also used for fonts and other assets)
    def loading_img(self, url):
        if self.notFinedUrl.count(url): return False  # URL already failed before
        _parse = urlparse.urlparse(url)
        _path = _parse.path
        if _path.startswith("/"): _path = _path[1:]
        Filename = self.path_join(self.webpath, _path)  # join directory and file name
        # skip if the file already exists
        if self.isFiles(Filename) == True: return False
        # create the directory
        self.makedirs(os.path.dirname(Filename))
        try:
            headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
            req = urllib2.Request(url=url, headers=headers)
            img = urllib2.urlopen(req)
            with open(self.path_join("data", Filename), 'wb') as localFile:
                localFile.write(img.read())
            return True
        except urllib2.URLError, e:
            self.notFinedUrl.append(url)
            if hasattr(e, 'reason'): print url + ' reason:{0}'.format(e.reason)
            if hasattr(e, 'code'): print url + ' code:{0}'.format(e.code)
            return False
    # write a file under data/ (only if it does not exist yet)
    def makefile(self, filename, contents):
        Filenames = self.path_join("data", filename)
        if os.path.isfile(Filenames) == False:
            self.basefiles.append(Filenames)
            fh = open(Filenames, 'wb')
            fh.write(contents)
            fh.close()
    # return the file extension without the leading dot
    def get_file_ext(self, filename):
        ext = os.path.splitext(filename)[1]
        return ext[1:]
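    # e.g. get_file_ext("style.min.css") -> "css"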
    # map a remote URL to a new local file path
    def get_new_url(self, oldurl):
        pathArr = urlparse.urlparse(oldurl)
        _paths = os.path.split(pathArr.path)  # split off the file name
        fileName = _paths[1]
        filePath = _paths[0]
        if fileName == '': fileName = "index"  # default file name for directory URLs
        else: fileName = os.path.splitext(fileName)[0]  # file name without its extension
        subName = pathArr.query
        if subName != '':
            # fold the query string into the file name
            subName = subName.replace("=", "")
            subName = subName.replace("&amp;", "-")
            fileName = fileName + "-" + subName.replace("&", "-")
        if pathArr.fragment != '': fileName = fileName + "-" + pathArr.fragment
        fileName = fileName + ".html"
        # keep the directory structure
        if filePath == '' or filePath == "/": return fileName
        else: return self.path_join(filePath, fileName)
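    # e.g. get_new_url("http://host/detail/id.php?id=2&page=1") -> "/detail/id-id2-page1.html"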
    # rewrite a resource path (css|img|js) relative to the referencing page
    def action_resouce_path(self, filename, path):
        pathArr = urlparse.urlparse(path)  # parse the page path
        _paths = os.path.split(pathArr.path)  # split off the file name
        path = _paths[0]
        if filename.startswith("/"): filename = filename[1:]
        # pages at the top level reference the resource directly
        if path == "/" or path == "": return filename
        replace_reg = re.compile(r'(\w+)/')  # each directory level becomes one "../"
        newPath = replace_reg.sub('../', path + "/")
        if newPath.startswith("/"):
            return newPath[1:] + filename
        return self.path_join(newPath, filename)
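    # e.g. action_resouce_path("css/style.css", "/news/list/index.html") -> "../../css/style.css"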
    # return the scheme and host of the target site
    def get_domain_host(self, url):
        _parse = urlparse.urlsplit(url)
        return _parse.scheme + "://" + _parse.netloc
    # collect the images, imported css and font files referenced via url(...) inside a css file
    def getResouceList(self, url, htmlContent=""):
        if htmlContent == "": htmlContent = self.getHtmtContent(url)
        if htmlContent != False:
            exits = ["png", "jpg", "gif", "jpeg", "eot", "ttf", "woff", "svg"]
            domain_host = self.get_domain_host(url)
            pattern = re.compile('url\((.*?)\)')
            macth = pattern.findall(htmlContent)
            rangePath = os.path.dirname(url)
            if len(macth) > 0:
                for index in range(len(macth)):
                    resouceURL = macth[index]
                    resouceURL = resouceURL.replace("\"", "")
                    resouceURL = resouceURL.replace("\'", "")
                    parse = urlparse.urlparse(resouceURL)
                    ext = self.get_file_ext(parse.path)  # file type
                    # skip disallowed file types
                    if exits.count(ext) == 0 and ext != "css": continue
                    if resouceURL.startswith("/"):
                        resouceURL = domain_host + parse.path
                    else:
                        resouceURL = os.path.join(rangePath, parse.path)  # join directory and file
                    # resolve to the real path
                    resouceURL = self.getRealpath(resouceURL)
                    # nested css files are processed recursively
                    if ext == "css":
                        self.createStaticFile(domain_host, resouceURL, "")
                    if exits.count(ext) > 0:
                        self.loading_img(resouceURL)
    # join up to four path segments, normalising to forward slashes
    def path_join(self, path1, path2=None, path3=None, path4=None):
        if path2.startswith("/"): path2 = path2[1:]
        path = os.path.join(path1, path2)
        if path3 != None:
            if path3.startswith("/"): path3 = path3[1:]
            path = os.path.join(path1, path2, path3)
        if path4 != None:
            if path4.startswith("/"): path4 = path4[1:]
            path = os.path.join(path1, path2, path3, path4)
        return path.replace("\\", "/")
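    # e.g. path_join("data", "/css", "style.css") -> "data/css/style.css"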
    # check whether a file exists under data/
    def isFiles(self, filename):
        Fileame = self.path_join("data", filename)
        return os.path.isfile(Fileame)
    # check whether a directory exists under data/
    def isDirs(self, dir):
        Dirname = self.path_join("data", dir)
        return os.path.isdir(Dirname)
    # resolve "./" and "../" segments and return the normalised path
    def getRealpath(self, path):
        abspath = os.path.abspath("./") + "\\"  # current working directory (Windows separator)
        resouceURL = os.path.realpath(path)  # resolve to the real path
        resouceURL = resouceURL.replace(abspath, "")
        resouceURL = resouceURL.replace("\\", "/")
        resouceURL = resouceURL.replace("://", "####")  # protect the scheme separator,
        resouceURL = resouceURL.replace(":/", "####")   # which realpath collapses to ":/"
        resouceURL = resouceURL.replace("####", "://")
        return resouceURL
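    # intended effect (on Windows, where this script was written):
    #   getRealpath("http://host/css/../img/a.png") -> "http://host/img/a.png"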
    # helpers for computing relative paths
    def all_equal(self, elements):
        first_element = elements[0]
        for other_element in elements[1:]:
            if other_element != first_element: return False
        return True
    def common_prefix(self, *sequences):
        if not sequences: return [], []
        common = []
        for elements in itertools.izip(*sequences):
            if not self.all_equal(elements): break
            common.append(elements[0])
        return common, [sequence[len(common):] for sequence in sequences]
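    # e.g. common_prefix(['a', 'b', 'c'], ['a', 'x']) -> (['a'], [['b', 'c'], ['x']])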
    # e.g. getRelpath('www.jinyinzs.com/detail/id/2.html','www.jinyinzs.com/cms/1.html','/')
    def getRelpath(self, p1, p2, sep=os.path.sep, pardir=os.path.pardir):
        p1 = self.path_join(self.webpath, p1)
        p2 = self.path_join(self.webpath, p2)
        common, (u1, u2) = self.common_prefix(p1.split(sep), p2.split(sep))
        if not common:
            return p2
        # len(u1) - 1: the last element of u1 is p1's own file name, not a directory
        return sep.join([pardir] * (len(u1) - 1) + u2)
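
# A minimal usage sketch (hypothetical target URL; the mirror is written under ./data/<host>/):
if __name__ == '__main__':
    crawler = http("http://www.example.com/")
    crawler.curl_macth()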