From 0d29a967aec32e2c391a16f7ca6ec7bf75b1a632 Mon Sep 17 00:00:00 2001 From: oyzy Date: Mon, 25 Sep 2023 14:21:17 +0800 Subject: [PATCH 1/3] =?UTF-8?q?=E6=96=B0=E5=A2=9E=E5=A5=87=E5=BF=AB?= =?UTF-8?q?=E4=B8=AD=E6=96=87=E7=BD=91=20common=E5=BC=95=E5=85=A5iconv?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/common.js | 10 ++-- src/crawler.13.js | 123 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 129 insertions(+), 4 deletions(-) create mode 100644 src/crawler.13.js diff --git a/src/common.js b/src/common.js index da902ed..43e3ba7 100644 --- a/src/common.js +++ b/src/common.js @@ -1,8 +1,10 @@ -import { load, he, randomUserAgent, randomPCUserAgent, randomMobileUserAgent, htmlToString, removeBasedOnSeparator } from './base_common' +import { load, he, randomUserAgent, randomPCUserAgent, randomMobileUserAgent, htmlToString, removeBasedOnSeparator } from './base_common.js' + +import iconv from 'iconv-lite'; const supported_encodings = ['utf-8', 'utf8', 'unicode-1-1-utf-8', 'gbk', 'gb2312', 'gb-2312', 'CP936', 'MS936']; -function checkEncoding(encoding) { +function checkEncoding (encoding) { if (!supported_encodings.includes(encoding.toLowerCase())) { throw new Error(`Encoding ${encoding} is not supported.`); } @@ -11,7 +13,7 @@ function checkEncoding(encoding) { /** * 发送请求 */ -async function request(url, options) { +async function request (url, options) { options ??= {}; console.log("发送请求: ", url, options.body ?? ''); @@ -62,7 +64,7 @@ async function request(url, options) { /** * 把字符串转为urlEncode编码 */ -function encodeUrl(inputString, encoding) { +function encodeUrl (inputString, encoding) { const encodedBuffer = iconv.encode(inputString, encoding); let urlEncodedString = ''; for (const byte of encodedBuffer) { diff --git a/src/crawler.13.js b/src/crawler.13.js new file mode 100644 index 0000000..92ffc70 --- /dev/null +++ b/src/crawler.13.js @@ -0,0 +1,123 @@ +import { load, request, htmlToString } from './common.js' + +const source = { + id: "free_book_js/13", + name: "奇快中文网", + portalUrl: "https://www.qikuaizw.cc", + searchBaseUrl: "https://www.qikuaizw.cc", + bookBaseUrl: "https://www.qikuaizw.cc", + active: true +}; + + +async function searchBook (keyword, pageNum) { + function getBoolList ($) { + const bookArr = []; + if ($('.zjlist').length > 0) {//存在目录 + bookArr.push({ + name: $("meta[property='og:novel:book_name']").attr('content'), + author: $("meta[property='og:novel:author']").attr('content'), + path: $("meta[property='og:novel:read_url']").attr('content').replace(source.bookBaseUrl, ''), + sourceId: source.id + }); + } else { + const alistboxArr = $("#main tr"); + alistboxArr.each(function (index, elem) { + if (index != 0) { + let tds = $(this).find("td"); + let nameAndPath = tds.eq(0).find("a"); + bookArr.push({ + name: nameAndPath.text(), + author: tds.eq(2).text(), + path: nameAndPath.attr('href').replace(source.bookBaseUrl, ''), + sourceId: source.id + }); + } + }) + } + return bookArr; + } + + let url = source.searchBaseUrl + '/modules/article/search.php?searchkey=' + keyword; + var html = await (await request(url, { urlEncode: 'gb2312', decode: 'gbk' })).text(); + var doc = load(html); + var bookArr = getBoolList(doc); + + return { + size: bookArr.length, + pageNum: 1, + pages: 1, + sourceId: source.id, + keyword: keyword, + bookList: bookArr + }; +} + +async function bookDetails (path) { + var html = await (await request(source.bookBaseUrl + path, { decode: 'gbk' })).text(); + let $ = load(html); + + return { + name: $("meta[property='og:novel:book_name']").attr('content'), + author: $("meta[property='og:novel:author']").attr('content'), + path: path, + sourceId: source.id, + intro: $("meta[property='og:description']").attr('content'), + coverUrl: $("meta[property='og:image']").attr('content'), + type: $("meta[property='og:novel:category']").attr('content'), + }; +} + +async function bookCatalog (path) { + var html = await (await request(source.bookBaseUrl + path, { decode: 'gbk' })).text(); + var $ = load(html); + const chapters = $('.zjlist a'); + let index = 0; + let catalog = []; + chapters.each(function (i, elem) { + var item = $(this) + catalog.push({ + index: index++, + title: item.text(), + path: path.replace(source.bookBaseUrl, '') + item.attr("href") + }); + }); + return catalog; +} + +async function chapterContent (path) { + var html = await (await request(source.bookBaseUrl + path, { decode: 'gbk' })).text(); + const $ = load(html); + let content = htmlToString($("#content").html()); + return content.replace(/奇快中文网[\s\S]*?最新章节!/g, ''); +} + +async function testSearchBook () { + const result = await searchBook('万族', 1); + console.log("搜索结果:", result); + return result; +} + +async function testBookDetails () { + const result = await testSearchBook(); + const detatil = await bookDetails(result.bookList[0].path); + console.log("书籍详情:", detatil); + return detatil; +} + +async function testBookCatalog () { + const info = await testBookDetails(); + const result = await bookCatalog(info.path); + console.log("书籍目录:", result); + return result; +} + +async function testChapterContent () { + const catalog = await testBookCatalog(); + const result = await chapterContent(catalog[0].path); + console.log("章节内容:", result); +} + +await testChapterContent(); +debugger; + -- Gitee From 46f958f42c56079ea839c07bddbae4312c42d911 Mon Sep 17 00:00:00 2001 From: oyzy Date: Mon, 25 Sep 2023 15:05:46 +0800 Subject: [PATCH 2/3] =?UTF-8?q?=E6=96=B0=E5=A2=9E=2067=E4=B9=A6=E5=90=A7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/crawler.13.js | 14 ++++- src/crawler.14.js | 133 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 145 insertions(+), 2 deletions(-) create mode 100644 src/crawler.14.js diff --git a/src/crawler.13.js b/src/crawler.13.js index 92ffc70..0afa732 100644 --- a/src/crawler.13.js +++ b/src/crawler.13.js @@ -43,6 +43,16 @@ async function searchBook (keyword, pageNum) { var doc = load(html); var bookArr = getBoolList(doc); + const pageMax = doc('#pagelink .last').text(); + if (pageMax && Number(pageMax) > 1) { + for (let i = 2; i <= Number(pageMax); i++) { + let url = source.searchBaseUrl + '/modules/article/search.php?searchkey=' + keyword + '&page=' + i; + let html = await (await request(url, { urlEncode: 'gb2312', decode: 'gbk' })).text(); + let doc = load(html); + bookArr = bookArr.concat(getBoolList(doc)); + } + } + return { size: bookArr.length, pageNum: 1, @@ -93,7 +103,7 @@ async function chapterContent (path) { } async function testSearchBook () { - const result = await searchBook('万族', 1); + const result = await searchBook('判官', 1); console.log("搜索结果:", result); return result; } @@ -118,6 +128,6 @@ async function testChapterContent () { console.log("章节内容:", result); } -await testChapterContent(); +await testSearchBook(); debugger; diff --git a/src/crawler.14.js b/src/crawler.14.js new file mode 100644 index 0000000..d9a2b8c --- /dev/null +++ b/src/crawler.14.js @@ -0,0 +1,133 @@ +import { load, request, htmlToString } from './common.js' + +const source = { + id: "free_book_js/13", + name: "67书吧", + portalUrl: "http://www.67shu.org", + searchBaseUrl: "http://www.67shu.org", + bookBaseUrl: "http://www.67shu.org", + active: true +}; + + +async function searchBook (keyword, pageNum) { + function getBoolList ($) { + const bookArr = []; + if ($('.zjlist').length > 0) {//存在目录 + bookArr.push({ + name: $("meta[property='og:novel:book_name']").attr('content'), + author: $("meta[property='og:novel:author']").attr('content'), + path: $("meta[property='og:novel:read_url']").attr('content').replace(source.bookBaseUrl, ''), + sourceId: source.id + }); + } else { + const alistboxArr = $("#main tr"); + alistboxArr.each(function (index, elem) { + if (index != 0) { + let tds = $(this).find("td"); + let nameAndPath = tds.eq(0).find("a"); + bookArr.push({ + name: nameAndPath.text(), + author: tds.eq(2).text(), + path: nameAndPath.attr('href').replace(source.bookBaseUrl, ''), + sourceId: source.id + }); + } + }) + } + return bookArr; + } + + let url = source.searchBaseUrl + '/modules/article/search.php?searchkey=' + keyword; + var html = await (await request(url, { urlEncode: 'gb2312', decode: 'gbk' })).text(); + var doc = load(html); + var bookArr = getBoolList(doc); + + const pageMax = doc('#pagelink .last').text(); + if (pageMax && Number(pageMax) > 1) { + for (let i = 2; i <= Number(pageMax); i++) { + let url = source.searchBaseUrl + '/modules/article/search.php?searchkey=' + keyword + '&page=' + i; + let html = await (await request(url, { urlEncode: 'gb2312', decode: 'gbk' })).text(); + let doc = load(html); + bookArr = bookArr.concat(getBoolList(doc)); + } + } + + return { + size: bookArr.length, + pageNum: 1, + pages: 1, + sourceId: source.id, + keyword: keyword, + bookList: bookArr + }; +} + +async function bookDetails (path) { + var html = await (await request(source.bookBaseUrl + path, { decode: 'gbk' })).text(); + let $ = load(html); + + return { + name: $("meta[property='og:novel:book_name']").attr('content'), + author: $("meta[property='og:novel:author']").attr('content'), + path: path, + sourceId: source.id, + intro: $("meta[property='og:description']").attr('content'), + coverUrl: $("meta[property='og:image']").attr('content'), + type: $("meta[property='og:novel:category']").attr('content'), + }; +} + +async function bookCatalog (path) { + var html = await (await request(source.bookBaseUrl + path, { decode: 'gbk' })).text(); + var $ = load(html); + const chapters = $('.zjlist a'); + let index = 0; + let catalog = []; + chapters.each(function (i, elem) { + var item = $(this) + catalog.push({ + index: index++, + title: item.text(), + path: path.replace(source.bookBaseUrl, '') + item.attr("href") + }); + }); + return catalog; +} + +async function chapterContent (path) { + var html = await (await request(source.bookBaseUrl + path, { decode: 'gbk' })).text(); + const $ = load(html); + let content = htmlToString($("#content").html()); + return content.replace(/67书吧[\s\S]*?最新章节!/g, ''); +} + +async function testSearchBook () { + const result = await searchBook('万族', 1); + console.log("搜索结果:", result); + return result; +} + +async function testBookDetails () { + const result = await testSearchBook(); + const detatil = await bookDetails(result.bookList[0].path); + console.log("书籍详情:", detatil); + return detatil; +} + +async function testBookCatalog () { + const info = await testBookDetails(); + const result = await bookCatalog(info.path); + console.log("书籍目录:", result); + return result; +} + +async function testChapterContent () { + const catalog = await testBookCatalog(); + const result = await chapterContent(catalog[0].path); + console.log("章节内容:", result); +} + +await testChapterContent(); +debugger; + -- Gitee From 110ce18c95035586932e2134c4de064771ab8f90 Mon Sep 17 00:00:00 2001 From: oyzy Date: Mon, 25 Sep 2023 15:23:43 +0800 Subject: [PATCH 3/3] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E5=BD=B1=E4=B9=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/crawler.14.js | 2 +- src/crawler.15.js | 112 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 113 insertions(+), 1 deletion(-) create mode 100644 src/crawler.15.js diff --git a/src/crawler.14.js b/src/crawler.14.js index d9a2b8c..c845110 100644 --- a/src/crawler.14.js +++ b/src/crawler.14.js @@ -1,7 +1,7 @@ import { load, request, htmlToString } from './common.js' const source = { - id: "free_book_js/13", + id: "free_book_js/14", name: "67书吧", portalUrl: "http://www.67shu.org", searchBaseUrl: "http://www.67shu.org", diff --git a/src/crawler.15.js b/src/crawler.15.js new file mode 100644 index 0000000..2dc8bb7 --- /dev/null +++ b/src/crawler.15.js @@ -0,0 +1,112 @@ +import { load, request, htmlToString } from './common.js' + +const source = { + id: "free_book_js/15", + name: "影书", + portalUrl: "https://www.yingsx.com", + searchBaseUrl: "https://www.yingsx.com", + bookBaseUrl: "https://www.yingsx.com", + active: true +}; + +async function searchBook (keyword, pageNum) { + function getBoolList ($) { + const bookArr = []; + const alistboxArr = $("#main ul li"); + alistboxArr.each(function (index, elem) { + if (index != 0) { + let tds = $(this).find("span"); + let nameAndPath = tds.eq(1).find("a"); + bookArr.push({ + name: nameAndPath.text(), + author: tds.eq(3).text(), + path: nameAndPath.attr("href"), + sourceId: source.id + }); + } + }) + return bookArr; + } + let url = source.searchBaseUrl + '/cse/search?q=' + encodeURIComponent(keyword); + var html = await (await request(url)).text(); + var doc = load(html); + var bookArr = getBoolList(doc); + + return { + size: bookArr.length, + pageNum: 1, + pages: 1, + sourceId: source.id, + keyword: keyword, + bookList: bookArr + }; +} + +async function bookDetails (path) { + var html = await (await request(source.bookBaseUrl + path)).text(); + let $ = load(html); + + return { + name: $("meta[property='og:novel:book_name']").attr('content'), + author: $("meta[property='og:novel:author']").attr('content'), + path: path, + sourceId: source.id, + intro: $("meta[property='og:description']").attr('content'), + coverUrl: $("meta[property='og:image']").attr('content'), + type: $("meta[property='og:novel:category']").attr('content'), + }; +} + +async function bookCatalog (path) { + var html = await (await request(source.bookBaseUrl + path)).text(); + var $ = load(html); + const chapters = $('#list dt:eq(1)').nextAll(); + let index = 0; + let catalog = []; + chapters.each(function (i, elem) { + var item = $(this).find('a') + catalog.push({ + index: index++, + title: item.text(), + path: item.attr("href") + }); + }); + return catalog; +} + +async function chapterContent (path) { + var html = await (await request(source.bookBaseUrl + path)).text(); + const $ = load(html); + let content = $("#content").html(); + return htmlToString(content); +} + +async function testSearchBook () { + const result = await searchBook('掌灯判官', 1); + console.log("搜索结果:", result); + return result; +} + +async function testBookDetails () { + const result = await testSearchBook(); + const detatil = await bookDetails(result.bookList[0].path); + console.log("书籍详情:", detatil); + return detatil; +} + +async function testBookCatalog () { + const info = await testBookDetails(); + const result = await bookCatalog(info.path); + console.log("书籍目录:", result); + return result; +} + +async function testChapterContent () { + const catalog = await testBookCatalog(); + const result = await chapterContent(catalog[0].path); + console.log("章节内容:", result); +} + +await testChapterContent(); +debugger; + -- Gitee