diff --git a/getlinksapp/function.py b/getlinksapp/function.py
index 52d1146c6569bb58e3fde1d3f00d525cbb56c42f..49472dbd4e613654df1a112367bb7bcc15669f7c 100644
--- a/getlinksapp/function.py
+++ b/getlinksapp/function.py
@@ -149,21 +149,28 @@ def HandleandSave(find_url, domain, url):
getLinks(link, domain=domain) # 域内未收录链接创建任务继续迭代
print(link, '下探扫描完成!')
-
+def VerifyUrl(url) -> bool:
+    """Return True when *url* is worth crawling (no blacklisted file extension)."""
+    # Anchor to the end of the URL; unanchored, r"\.doc" would also reject
+    # ordinary pages such as "https://example.com/docs/index.html".
+    black_list = r"(?:\.mov|\.mkv|\.avi|\.mp4|\.mp3|\.f4v|\.asf|\.wmv|\.mpeg|\.exe|\.doc|\.docx|\.pdf|\.xlsx|\.xls|\.ppt|\.pptx|\.run|\.rpm|\.deb|\.msi|\.iso|\.zip|\.7z|\.rar|\.tar|\.tar\.gz|\.png|\.jpeg|\.jpg|\.svg|\.gif|\.ttf|\.otf|\.woff|\.dtd)$"
+    pattern = re.compile(black_list, re.IGNORECASE)  # extensions are case-insensitive on the web
+    return pattern.search(url) is None
def getLinks(url, domain):
    # 获取链接的内容
-    print('[...]正在获取', url, '的网页内容')
-    res_code, res_content = getRes(url)
-    if res_code != 0 and res_content != None:
-        # 1.正则匹配文章中的链接并做检查
-        find_url = getLinks_by_re(res_content)
-        HandleandSave(find_url, domain, url)
-
-        # 2.排查标签中的链接并作检查
-        find_url = getLinks_by_soup(res_content, url)
-        HandleandSave(find_url, domain, url)
-    else:
-        HandleandSave('', domain, url)
+    if VerifyUrl(url):
+        print('[...]正在获取', url, '的网页内容')
+        res_code, res_content = getRes(url)
+        if res_code != 0 and res_content != None:
+            # 1.正则匹配文章中的链接并做检查
+            find_url = getLinks_by_re(res_content)
+            HandleandSave(find_url, domain, url)
+
+            # 2.排查标签中的链接并作检查
+            find_url = getLinks_by_soup(res_content, url)
+            HandleandSave(find_url, domain, url)
+        else:
+            HandleandSave('', domain, url)
class ExcelImport: