加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
seafishes_crawler.py 1.65 KB
一键复制 编辑 原始数据 按行查看 历史
maudyi 提交于 2020-06-16 00:08 . 加上了sqlite持久化。
# -*- coding: utf-8 -*-
import os
import re
import subprocess
import sys
import time
import urllib
import urllib.request
from datetime import datetime
from string import Template

from apscheduler.jobstores.sqlalchemy import SQLAlchemyJobStore
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers.cron import CronTrigger
from scrapy import cmdline
from scrapy.cmdline import execute
def download(url):
    """Download *url* and return the raw response body as bytes.

    The response object is used as a context manager so the underlying
    HTTP connection is always closed (the original returned without
    closing it, leaking the socket). A timeout keeps a stalled server
    from hanging a scheduler worker thread forever.
    """
    with urllib.request.urlopen(url, timeout=30) as response:
        return response.read()
def crawl_sitemap():
    """Fetch page 1 of the FMO price listing and print the URL and its
    UTF-8-decoded HTML body."""
    url_template = Template('https://www.fmo.org.hk/price?id=8&path=12_43_56&page=${pageNum}&per-page=10')
    page_url = url_template.substitute(pageNum='1')
    print(page_url)
    raw_body = download(page_url)
    decoded_body = str(raw_body, encoding="utf-8")
    print(decoded_body)
def job_function():
    """Scheduled task: run the 'hongkong' scrapy spider in a child process.

    scrapy's in-process entry points (cmdline.execute / execute) install
    signal handlers, which raises "ValueError: signal only works in main
    thread" when invoked from an APScheduler worker thread -- so the
    spider must run in a separate process instead.
    """
    print('running crawler task, getting data from HongKong...... ')
    # subprocess.run with an argv list replaces os.system's shell string:
    # no shell interpolation, and failures don't pass silently unnoticed.
    result = subprocess.run(['scrapy', 'crawl', 'hongkong', '--nolog'],
                            check=False)
    if result.returncode != 0:
        print('scrapy crawl hongkong exited with code', result.returncode)
if __name__ == '__main__':
    # crawl_sitemap()
    scheduler = BackgroundScheduler(
        jobstores={'sqlite': SQLAlchemyJobStore(url='sqlite:////root/crawler.db')},
        # Allow a missed run to fire up to 15 minutes late before skipping it.
        job_defaults={'misfire_grace_time': 15 * 60},
    )
    # Bug fix: the SQLAlchemy jobstore is registered under the alias
    # 'sqlite', but add_job() defaults to the 'default' alias, so the job
    # previously landed in an implicit in-memory store and was never
    # persisted to the SQLite file. Target the 'sqlite' store explicitly,
    # and give the job a stable id with replace_existing=True so a restart
    # updates the persisted job instead of raising ConflictingIdError.
    scheduler.add_job(
        job_function,
        CronTrigger.from_crontab('10 22 * * *'),  # daily at 22:10
        id='hongkong_daily_crawl',
        jobstore='sqlite',
        replace_existing=True,
    )
    scheduler.start()
    print('Crawler of HongKong has been started by apscheduler.')
    try:
        # BackgroundScheduler runs in a daemon thread; keep the main
        # thread alive so the process does not exit immediately.
        while True:
            time.sleep(5)
    except (KeyboardInterrupt, SystemExit):
        scheduler.shutdown()
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化