http.py
高小姐 committed on 2017-02-15 16:21: website crawler tool v1.0
# -*- coding: UTF-8 -*-
# Website crawler (Python 2): mirrors a site's pages and static assets into ./data/
import urllib2
import re
import os, itertools
import urlparse
class http:
    # note: these are class-level attributes, shared by all instances
    contents = ''     # page content
    website = ''      # target URL
    webpath = ''      # local save directory (the site's host name)
    allurl = []       # links queued for crawling
    basefiles = []    # paths of files already generated
    currDomain = ''   # current host domain (scheme + host)
    notFinedUrl = []  # URLs that failed to load (404 etc.)
    def __init__(self, url):
        self.website = url
        _parse = urlparse.urlparse(self.website)
        self.webpath = _parse.netloc
        self.makedirs(_parse.netloc)
        self.currDomain = _parse.scheme + "://" + _parse.netloc  # current domain
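    # Illustrative (hypothetical URL): http("http://www.example.com/news/")
    # sets webpath = "www.example.com" and currDomain = "http://www.example.com".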
    def curl_macth(self, url='', _index=0):
        _url = url
        if _url == '':
            _url = self.website
        exits = ["png", "jpg", "gif", "jpeg"]
        ext = self.get_file_ext(_url)
        print "Action:" + _url
        if exits.count(ext.lower()) == 0:
            # check whether the link can be opened
            htmlContent = self.getHtmtContent(_url)
            if htmlContent != False:
                pattern = re.compile('<a.*?href=[\'\"](.*?)[\'\"].*?>')
                macth = pattern.findall(htmlContent)
                _path = self.get_new_url(_url)
                htmlContent = self.actionAUrl(macth, htmlContent, _path)
                if _path.startswith("/"): _path = _path[1:]
                _pathfile = self.path_join(self.webpath, _path)
                # only generate the file if it does not exist yet
                if self.isFiles(_pathfile) == False:
                    # create the directory when it does not exist
                    if self.isDirs(os.path.split(_pathfile)[0]) == False:
                        self.makedirs(os.path.split(_pathfile)[0])
                    # pull in stylesheets, scripts and images
                    htmlContent = self.macthStaticFile(htmlContent, self.currDomain, _url)
                    # write the page to disk
                    self.makefile(_pathfile, htmlContent)
        else:
            self.loading_img(_url)
        # walk the links collected from the pages crawled so far
        if _index < len(self.allurl):
            curlparse = self.allurl[_index]  # take one link
            subparse = urlparse.urlparse(self.allurl[_index])
            if subparse.netloc == '':
                if curlparse.startswith("/"): curlparse = curlparse[1:]
                curlparse = self.path_join(self.currDomain, curlparse)
            # continue with the next page
            self.curl_macth(curlparse, _index + 1)
    def actionAUrl(self, urlLists, contents, curl):
        if len(urlLists) > 0:
            urlLists.sort(key=lambda x: len(x), reverse=True)  # longest links first
            for index in range(len(urlLists)):
                _cparse = urlparse.urlparse(urlLists[index])
                # skip links that point to another domain
                if _cparse.netloc != '' and self.webpath != _cparse.netloc:
                    continue
                # only follow http/https links
                if _cparse.scheme != '' and (_cparse.scheme != 'http' and _cparse.scheme != "https"):
                    continue
                if _cparse.netloc == '' and _cparse.path == '':
                    continue
                if self.allurl.count(urlLists[index]) > 0:
                    continue
                self.allurl.append(urlLists[index])
                # rewrite the href to a local relative path
                relativeURL = self.getRelpath(curl, self.get_new_url(urlLists[index]), "/")
                # re.escape: URLs may contain regex metacharacters such as "?" and "."
                contents = re.sub('href=[\'\"]' + re.escape(urlLists[index]) + '[\'\"]', "href=\"" + relativeURL + "\"", contents)
        return contents
    # match <link> (css) and <script> (js) references
    # and download the remote files locally
    def macthStaticFile(self, contents, _chttphost, _url):
        csspattern = re.compile('<link.*?href=[\'\"](.*?)[\'\"].*?>')
        cssmacth = csspattern.findall(contents)
        jspattern = re.compile('<script.*?src=[\'\"](.*?)[\'\"].*?>')
        jsmacth = jspattern.findall(contents)
        macth = cssmacth + jsmacth  # merge both match lists
        if len(macth) > 0:
            for index in range(len(macth)):
                newstylepath = self.createStaticFile(_chttphost, macth[index], _url)
                if newstylepath != False:
                    contents = contents.replace(macth[index], newstylepath)
        contents = self.macth_image_file(contents, _chttphost, _url)
        return contents
    # save a static file (css/js) locally
    def createStaticFile(self, _chttphost, staticUrl, _url):
        exits = ["css", "js"]
        _parse = urlparse.urlparse(staticUrl)
        # skip files hosted on another domain
        if _parse.netloc != '' and _parse.netloc != self.webpath: return False
        exit = self.get_file_ext(_parse.path)
        if exits.count(exit.lower()) == 0: return False  # wrong file type
        sfile = os.path.split(_parse.path)
        styleName = sfile[1]
        stylepath = sfile[0]
        stylepath = stylepath.replace("../", "")
        stylepath = stylepath.replace("./", "")
        if stylepath.startswith("/"): stylepath = stylepath[1:]
        _path = self.path_join(self.webpath, stylepath, styleName)
        _rangeurl = staticUrl
        if _parse.netloc == '':
            _rangeurl = self.getRealpath(_chttphost + "/" + _parse.path)
        # create the directory when it does not exist
        if self.isDirs(self.path_join(self.webpath, stylepath)) == False:
            self.makedirs(self.path_join(self.webpath, stylepath))
        # check the target file itself, not just its directory
        if self.isFiles(_path) == False:
            Contents = self.getHtmtContent(_rangeurl)
            if Contents == False: return False
            if Contents != '':
                self.makefile(_path, Contents)
            # fetch images, fonts and nested css referenced inside the css file
            if exit.lower() == "css":
                self.getResouceList(_rangeurl, Contents)
        # return the new relative path
        if _url != "":
            return self.action_resouce_path(self.path_join(stylepath, styleName), _url)
        return False
    # match <img> tags
    # and download the page's images locally
    def macth_image_file(self, contents, _chttphost, _url):
        pattern = re.compile('<img.*?src=[\'\"](.*?)[\'\"].*?>')
        macth = pattern.findall(contents)
        exits = ["png", "jpg", "gif", "jpeg"]
        if len(macth) > 0:
            for index in range(len(macth)):
                imgurl = macth[index]  # remote image URL
                _parse = urlparse.urlparse(imgurl)
                if _parse.netloc != '' and _parse.netloc != self.webpath: continue  # skip other domains
                exit = self.get_file_ext(_parse.path)
                if exits.count(exit.lower()) == 0: continue
                sfile = os.path.split(_parse.path)
                imgName = sfile[1]
                _path = sfile[0]  # remote image directory
                _path = _path.replace("../", "")
                _path = _path.replace("./", "")
                imgpath = self.path_join(self.webpath, _path, imgName)
                if _parse.netloc == '':
                    imgurl = self.getRealpath(_chttphost + "/" + _parse.path)
                # loading_img skips the download if the file already exists
                loadStatsu = self.loading_img(imgurl)
                if loadStatsu == False: continue
                newimgpath = self.action_resouce_path(self.path_join(_path, imgName), _url)  # new relative path
                contents = contents.replace(macth[index], newimgpath)
        return contents
    # fetch a URL and return its body, or False on failure
    def getHtmtContent(self, url):
        if self.notFinedUrl.count(url): return False  # URL already failed before
        try:
            headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
            req = urllib2.Request(url=url, headers=headers)
            page = urllib2.urlopen(req)
            html = page.read()
            return html
        except urllib2.URLError, e:
            self.notFinedUrl.append(url)
            if hasattr(e, 'reason'): print url + ' reason:{0}'.format(e.reason)
            if hasattr(e, 'code'): print url + ' code:{0}'.format(e.code)
            return False
    # create a directory under data/
    def makedirs(self, name):
        path = self.path_join("data", name)
        if os.path.isdir(path) == False:
            os.makedirs(path)
    # download a remote image (also used for fonts and other assets)
    def loading_img(self, url):
        if self.notFinedUrl.count(url): return False  # URL already failed before
        _parse = urlparse.urlparse(url)
        _path = _parse.path
        if _path.startswith("/"): _path = _path[1:]
        Filename = self.path_join(self.webpath, _path)  # join directory and file name
        # skip if the file already exists
        if self.isFiles(Filename) == True: return False
        # create the directory
        self.makedirs(os.path.dirname(Filename))
        try:
            headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
            req = urllib2.Request(url=url, headers=headers)
            img = urllib2.urlopen(req)
            with open(self.path_join("data", Filename), 'wb') as localFile:
                localFile.write(img.read())
            return True
        except urllib2.URLError, e:
            self.notFinedUrl.append(url)
            if hasattr(e, 'reason'): print url + ' reason:{0}'.format(e.reason)
            if hasattr(e, 'code'): print url + ' code:{0}'.format(e.code)
            return False
    # write a file under data/ (only if it does not exist yet)
    def makefile(self, filename, contents):
        Filenames = self.path_join("data", filename)
        if os.path.isfile(Filenames) == False:
            self.basefiles.append(Filenames)
            fh = open(Filenames, 'wb')
            fh.write(contents)
            fh.close()
    # return the file extension without the leading dot
    def get_file_ext(self, filename):
        ext = os.path.splitext(filename)[1]
        return ext[1:]
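    # e.g. get_file_ext("style.min.css") -> "css"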
    # map a remote URL to a new local file path
    def get_new_url(self, oldurl):
        pathArr = urlparse.urlparse(oldurl)
        _paths = os.path.split(pathArr.path)  # split off the file name
        fileName = _paths[1]
        filePath = _paths[0]
        if fileName == '': fileName = "index"  # default file name for directory URLs
        else: fileName = os.path.splitext(fileName)[0]  # file name without its extension
        subName = pathArr.query
        if subName != '':
            # fold the query string into the file name
            subName = subName.replace("=", "")
            subName = subName.replace("&amp;", "-")
            fileName = fileName + "-" + subName.replace("&", "-")
        if pathArr.fragment != '': fileName = fileName + "-" + pathArr.fragment
        fileName = fileName + ".html"
        # keep the directory structure
        if filePath == '' or filePath == "/": return fileName
        else: return self.path_join(filePath, fileName)
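    # e.g. get_new_url("http://host/detail/id.php?id=2&page=1") -> "/detail/id-id2-page1.html"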
    # rewrite a resource path (css|img|js) relative to the referencing page
    def action_resouce_path(self, filename, path):
        pathArr = urlparse.urlparse(path)  # parse the page path
        _paths = os.path.split(pathArr.path)  # split off the file name
        path = _paths[0]
        if filename.startswith("/"): filename = filename[1:]
        # pages at the top level reference the resource directly
        if path == "/" or path == "": return filename
        replace_reg = re.compile(r'(\w+)/')  # each directory level becomes one "../"
        newPath = replace_reg.sub('../', path + "/")
        if newPath.startswith("/"):
            return newPath[1:] + filename
        return self.path_join(newPath, filename)
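    # e.g. action_resouce_path("css/style.css", "/news/list/index.html") -> "../../css/style.css"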
    # return the scheme and host of the target site
    def get_domain_host(self, url):
        _parse = urlparse.urlsplit(url)
        return _parse.scheme + "://" + _parse.netloc
    # collect the images, imported css and font files referenced via url(...) inside a css file
    def getResouceList(self, url, htmlContent=""):
        if htmlContent == "": htmlContent = self.getHtmtContent(url)
        if htmlContent != False:
            exits = ["png", "jpg", "gif", "jpeg", "eot", "ttf", "woff", "svg"]
            domain_host = self.get_domain_host(url)
            pattern = re.compile('url\((.*?)\)')
            macth = pattern.findall(htmlContent)
            rangePath = os.path.dirname(url)
            if len(macth) > 0:
                for index in range(len(macth)):
                    resouceURL = macth[index]
                    resouceURL = resouceURL.replace("\"", "")
                    resouceURL = resouceURL.replace("\'", "")
                    parse = urlparse.urlparse(resouceURL)
                    ext = self.get_file_ext(parse.path)  # file type
                    # skip disallowed file types
                    if exits.count(ext) == 0 and ext != "css": continue
                    if resouceURL.startswith("/"):
                        resouceURL = domain_host + parse.path
                    else:
                        resouceURL = os.path.join(rangePath, parse.path)  # join directory and file
                    # resolve to the real path
                    resouceURL = self.getRealpath(resouceURL)
                    # nested css files are processed recursively
                    if ext == "css":
                        self.createStaticFile(domain_host, resouceURL, "")
                    if exits.count(ext) > 0:
                        self.loading_img(resouceURL)
    # join up to four path segments, normalising to forward slashes
    def path_join(self, path1, path2=None, path3=None, path4=None):
        if path2.startswith("/"): path2 = path2[1:]
        path = os.path.join(path1, path2)
        if path3 != None:
            if path3.startswith("/"): path3 = path3[1:]
            path = os.path.join(path1, path2, path3)
        if path4 != None:
            if path4.startswith("/"): path4 = path4[1:]
            path = os.path.join(path1, path2, path3, path4)
        return path.replace("\\", "/")
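    # e.g. path_join("data", "/css", "style.css") -> "data/css/style.css"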
    # check whether a file exists under data/
    def isFiles(self, filename):
        Fileame = self.path_join("data", filename)
        return os.path.isfile(Fileame)
    # check whether a directory exists under data/
    def isDirs(self, dir):
        Dirname = self.path_join("data", dir)
        return os.path.isdir(Dirname)
    # resolve "./" and "../" segments and return the normalised path
    def getRealpath(self, path):
        abspath = os.path.abspath("./") + "\\"  # current working directory (Windows separator)
        resouceURL = os.path.realpath(path)  # resolve to the real path
        resouceURL = resouceURL.replace(abspath, "")
        resouceURL = resouceURL.replace("\\", "/")
        resouceURL = resouceURL.replace("://", "####")  # protect the scheme separator,
        resouceURL = resouceURL.replace(":/", "####")   # which realpath collapses to ":/"
        resouceURL = resouceURL.replace("####", "://")
        return resouceURL
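    # intended effect (on Windows, where this script was written):
    #   getRealpath("http://host/css/../img/a.png") -> "http://host/img/a.png"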
    # helpers for computing relative paths
    def all_equal(self, elements):
        first_element = elements[0]
        for other_element in elements[1:]:
            if other_element != first_element: return False
        return True
    def common_prefix(self, *sequences):
        if not sequences: return [], []
        common = []
        for elements in itertools.izip(*sequences):
            if not self.all_equal(elements): break
            common.append(elements[0])
        return common, [sequence[len(common):] for sequence in sequences]
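    # e.g. common_prefix(['a', 'b', 'c'], ['a', 'x']) -> (['a'], [['b', 'c'], ['x']])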
    # e.g. getRelpath('www.jinyinzs.com/detail/id/2.html','www.jinyinzs.com/cms/1.html','/')
    def getRelpath(self, p1, p2, sep=os.path.sep, pardir=os.path.pardir):
        p1 = self.path_join(self.webpath, p1)
        p2 = self.path_join(self.webpath, p2)
        common, (u1, u2) = self.common_prefix(p1.split(sep), p2.split(sep))
        if not common:
            return p2
        # len(u1) - 1: the last element of u1 is p1's own file name, not a directory
        return sep.join([pardir] * (len(u1) - 1) + u2)
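
# A minimal usage sketch (hypothetical target URL; the mirror is written under ./data/<host>/):
if __name__ == '__main__':
    crawler = http("http://www.example.com/")
    crawler.curl_macth()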