| | |
| | | } |
| | | } |
| | | |
| | | /* if (pdfUrl) { |
| | | if (pdfUrl) { |
| | | return pdfUrl; |
| | | } else */ |
| | | if (textUrl) { |
| | | } else if (textUrl) { |
| | | return textUrl; |
| | | } else { |
| | | book.state = "没有text文件"; |
| | |
| | | async function downloadFile(book, url) { |
| | | console.log(`下载文件: ${url}`); |
| | | const ext = url.split(".").pop().toLowerCase(); |
| | | const filepath = `./downloads/${book.id} ${book.isbn}.txt`; |
| | | const filepath = `./downloads/${book.id} ${book.isbn}.${ext}`; |
| | | if (fs.existsSync(filepath)) { |
| | | book.state = `下载完成`; |
| | | book.format = ext; |
| | |
| | | book.format = ext; |
| | | book.file = filepath; |
| | | book.url = url; |
| | | console.log(`下载完成:${filepath}`); |
| | | setTimeout(() => { |
| | | if (ext === "gz" || ext === "zip") { |
| | | unzip(_filepath, filepath); |
| | | fs.unlinkSync(_filepath); |
| | | } |
| | | let text = fs.readFileSync(filepath, 'utf-8'); |
| | | text = getTextFromHtml(text); |
| | | fs.writeFileSync(filepath, text, 'utf-8'); |
| | | try { |
| | | fs.writeFileSync(filepath + '.result.txt', cleanText(text), 'utf-8'); |
| | | } catch (e) { |
| | | reject(e); |
| | | try { |
| | | out.close(); |
| | | fs.unlink(filepath, (e) => console.error(e)); |
| | | } catch (e) { |
| | | console.error(e); |
| | | } |
| | | } |
| | | }, 1000); |
| | | resolve(true); |
| | | }); |
| | | stream.on("error", (err) => { |
| | |
| | | |
/**
 * Reads publisher, publication date and page count from the page currently
 * loaded in `driver` and stores them on `book`.
 *
 * Each field is read with its own in-page script. A failed read (for example
 * when the element is not on the page) does not abort the others: the
 * affected value becomes `0` instead.
 *
 * @param {object} book - Record to fill in. `book.publisher` and
 *   `book.pubDate` are always assigned (text content, or `0` on failure);
 *   `book.pages` is assigned only when the results counter could be read.
 * @returns {*} Whatever `retry` returns for the wrapped async callback.
 *   NOTE(review): `retry` is defined elsewhere; presumably it re-runs the
 *   callback on rejection and returns a promise - confirm against its definition.
 */
function getBookInfo(book) {
  // Returns the textContent of the first element matching `selector`,
  // or 0 when the in-page script rejects.
  const readText = (selector) =>
    driver
      .executeScript(`return document.querySelector("${selector}").textContent`)
      .catch(() => 0);

  return retry(async () => {
    book.publisher = await readText("span[itemprop=publisher]");
    book.pubDate = await readText("span[itemprop=datePublished]");

    // The counter text is split on " / " and the second part is stored as the
    // page count. NOTE(review): presumably the format is "<current> / <total>"
    // - confirm against the page.
    const pages = await readText("span[data-id=resultsCount]");
    if (pages) {
      book.pages = pages.split(" / ")[1];
    }
  });
}
| | | |
| | |
| | | try { |
| | | await downloadFile(book, url); |
| | | console.log(`下载完成: ${book.id} ${book.title}`); |
| | | console.log('finish: '+JSON.stringify(book)); |
| | | } catch (e) { } |
| | | successCount++; |
| | | // 等一段时间再下一个 |