Skip to content

Commit

Permalink
Merge pull request #222 from jiaojiaodubai/E-Tiller
Browse files Browse the repository at this point in the history
add (E-Tiller.js) 新增勤云科技 (#64)(#106)
  • Loading branch information
l0o0 authored Nov 24, 2023
2 parents 2f3d210 + 9563104 commit 10f1f83
Show file tree
Hide file tree
Showing 3 changed files with 360 additions and 0 deletions.
356 changes: 356 additions & 0 deletions E-Tiller.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,356 @@
/*
***** BEGIN LICENSE BLOCK *****
Copyright © 2022 jiaojiaodubai23@gmail.com
This file is part of Zotero.
Zotero is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Zotero is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with Zotero. If not, see <http://www.gnu.org/licenses/>.
***** END LICENSE BLOCK *****
*/

const WEBTYPE = [
{
name: 'table_richsapn',
pagekey: 'table.front_table #Author',
issingle: true
},
// 缺失关键信息,无法解析
// {
// name: 'table_lessspan',
// pagekey: '#QueryUI td > table td > table:nth-child(1) td > table tr:nth-child(4) > td > table #FileTitle',
// issingle: true
// },
{
name: 'exportable',
pagekey: '#ExportUrl',
issingle: true
},
{
name: 'view_abstract',
pagekey: 'a[href*="reader/view_abstract.aspx?file_no="]',
issingle: false
},
{
name: 'artical_abstract',
pagekey: 'a[href*="/article/abstract/"]',
issingle: false
}
];

function detectWeb(doc, _url) {
var atts = [
'class',
'id'
];
let insite = atts.some((element) => {
let foots = Array.from(doc.querySelectorAll(`div[${element}*="foot"]`));
return foots.some(foot => (foot.innerText.match(/()?.*/)));
});
if (!insite) return false;
let validtype = WEBTYPE.find(element => (
doc.querySelector(element.pagekey)
));
// Z.debug(`detect type as\n${JSON.stringify(validtype)}`);
if (!validtype) return false;
if (validtype.issingle) {
return 'journalArticle';
}
else if (getSearchResults(doc, validtype, true)) {
return 'multiple';
}
return false;
}

function getSearchResults(doc, type, checkOnly) {
var items = {};
var found = false;
let tempRows = Array.from(doc.querySelectorAll(type.pagekey));
var rows = tempRows.filter(element => (!(element.textContent.startsWith('摘要'))));
for (let row of rows) {
let href = row.href;
let title = ZU.trimInternal(row.textContent);
if (!href || !title) continue;
if (checkOnly) return true;
found = true;
items[href] = title;
// Z.debug(`add\n${href}\n${title}\nas item`);
}
return found ? items : false;
}

async function doWeb(doc, url) {
let validtype = WEBTYPE.find(element => (
doc.querySelector(element.pagekey)
));
if (detectWeb(doc, url) == 'multiple') {
let items = await Zotero.selectItems(getSearchResults(doc, validtype, false));
if (!items) return;
for (let url of Object.keys(items)) {
await doWeb(await requestDocument(url), url);
}
}
else {
switch (validtype.name) {
case 'table_richsapn':
await scrapeElement(doc, url);
break;
case 'exportable':
await scrapeRis(doc, url);
break;
default:
break;
}
}
}

const TABLEIDMAP = {
title: { path: '#FileTitle' },
titleTranslation: { path: '#EnTitle' },
abstractNote: { path: '#Abstract' },
abstractTranslation: { path: '#EnAbstract' },
publicationTitle: {
path: '#ReferenceText',
callback: function (text) {
text = text.split('.').split(',')[0];
return text;
}
},
volume: {
path: '#ReferenceText',
callback: function (text) {
text = text.split('.')
.pop().split(',')
.pop();
return text.match(/\d+(?=())/)[0];
}
},
issue: {
path: '#ReferenceText',
callback: function (text) {
text = text.split('.').pop().split(',')
.pop();
return text.match(/\(\d+\)/)[0].slice(1, -1);
}
},
pages: {
path: '#ReferenceText',
callback: function (text) {
return text.split(':').pop();
}
},
date: {
path: '#ReferenceText',
callback: function (text) {
return text.split('.').pop().split(',')[0];
}
},
DOI: { path: '#DOI' },
creators: {
path: '#Author > table tr td:first-of-type',
multiple: true,
callback: function (arr) {
if (arr[0] == '作者') arr.shift();
arr = arr.map(element => (element.replace(/\d/g, '')));
arr = cleanArr(arr);
arr = arr.filter(element => (element.length > 1));
return arr.map(element => (matchCreator(element)));
}
},
tags: {
path: '#KeyWord a',
multiple: true,
callback: function (arr) {
arr = cleanArr(arr);
return arr.map(keyword => (
{ tag: keyword }
));
}
}
};

function cleanArr(arr) {
let seprator = /[,;]\s?/;
if (arr.length == 1 && arr[0].match(seprator)) {
arr = arr[0].split(seprator);
}
return arr;
}

function matchCreator(creator) {
// Z.debug(creators);
if (/[A-Za-z]/.test(creator)) {
creator = ZU.cleanAuthor(creator, "author");
}
else {
creator = creator.replace(/\s/g, '');
creator = {
lastName: creator,
creatorType: "author",
fieldMode: 1
};
}
return creator;
}

async function scrapeElement(doc, url = doc.location.href) {
var newItem = new Z.Item('journalArticle');
for (const field in TABLEIDMAP) {
const recipe = TABLEIDMAP[field];
// Z.debug(`for ${field}, use selector path ${recipe.path}`);
var result;
try {
if (recipe.multiple) {
result = Array.from(doc.querySelectorAll(recipe.path)).map(
element => (element.innerText)
);
}
else {
result = doc.querySelector(recipe.path);
result = result.innerText;
}
if (recipe.callback) {
result = recipe.callback(result);
}
// Z.debug(`return result:`);
// Z.debug(result);
}
catch (error) {
result = "";
}
newItem[field] = result;
}
// 有些网页加载异常就会获取不到标题,只能用英文标题暂替
if (newItem.title == '') newItem.title = newItem.titleTranslation;
var pdfURL = doc.querySelector('#URL').href;
newItem.attachments.push({
url: pdfURL,
title: 'Full Text PDF',
mimeType: 'application/pdf'
});
newItem.attachments.push({
url: url,
title: 'Snapshot',
mimeType: 'text/html'
});
newItem.complete();
}

const METAMAP = {
volume: {
tag: 'name',
value: 'citation_volume'
},
issue: {
tag: 'name',
value: 'citation_issue'
},
date: {
tag: 'name',
value: 'citation_date'
},
journalAbbreviation: {
tag: 'name',
value: 'citation_journal_abbrev'
},
language: {
tag: 'http-equiv',
value: 'Content-Language'
},
ISSN: {
tag: 'name',
value: 'citation_issn'
},
firstpage: {
tag: 'name',
value: 'citation_firstpage'
},
lastpage: {
tag: 'name',
value: 'citation_lastpage'
}
};

async function scrapeRis(doc, url = doc.location.href) {
var risURL = doc.querySelector('#ExportUrl').href.split('/');
let id = risURL.pop();
risURL = risURL.join('/');
// Z.debug(risURL);
let pdfURL = doc.querySelector('#PdfUrl').href;
// Z.debug(pdfURL);
let risText = await requestText(
risURL,
{
method: 'POST',
body: `export_type=ris&include_content=2&article_list=${id}&action_type=export`
});
let translator = Zotero.loadTranslator('import');
translator.setTranslator('32d59d2d-b65a-4da4-b0a3-bdd3cfb979e7'); // RIS
translator.setString(risText);
translator.setHandler('itemDone', (_obj, item) => {
for (const field in METAMAP) {
const recipe = METAMAP[field];
let result;
try {
result = doc.querySelector(`head > meta[${recipe.tag}="${recipe.value}"]`).content;
}
catch (error) {
result = '';
}
// Z.debug(`in field ${field},`)
// Z.debug(`we already have:\n${item[field]}`);
// Z.debug(`and now we get from meta:\n${result}`);
if (!item[field] && result) {
item[field] = result;
}
}
item.creators = item.creators.map((creator) => {
if (!/[A-Za-z]/.test((creator).lastName)) {
return matchCreator((creator).lastName);
}
return (creator);
});
item.pages = (function () {
let firstpage = item.firstpage;
let lastpage = item.lastpage;
delete item.firstpage;
delete item.lastpage;
if (firstpage && lastpage) {
return `${firstpage}-${lastpage}`;
}
else if (firstpage || lastpage) {
return [firstpage, lastpage].find(page => (page));
}
else {
return '';
}
})();
item.language = (['zh', 'zh-cn'].includes(item.language)) ? 'zh-CN' : 'en-US';
item.attachments.push({
url: pdfURL,
title: 'Full Text PDF',
mimeType: 'application/pdf'
});
item.attachments.push({
url: url,
title: 'Snapshot',
mimeType: 'text/html'
});
item.complete();
});
await translator.translate();
}


3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,9 @@
- 摘要信息:作者简介 + 内容简介
- 目录信息:目录存放至note附件里
- [详细截图](https://github.com/Captain2021/myTranslator/tree/main)
+ 勤云科技 -> [E-Tiller](/E-Tiller.js)
- 支持最新、次新版的勤云科技期刊网站
- 对照meta标签补全导出结果
+ 仁和汇智 -> [RHHZ.js](./RHHZ.js)
- [x] 抓取引文信息
- [x] 支持单条目
Expand Down
1 change: 1 addition & 0 deletions data/data.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
"PKULaw": "北大法宝",
"China Judgements Online": "中国裁判文书网",
"PubScholar": "PubScholar公益学术平台",
"E-Tiller": "勤云科技"
"RHHZ": "仁和汇智"
"ChinaXiv": "中国科学院科技论文预发布平台",
"National Science and Technology Library - China": "国家科技图书文献中心"
Expand Down

0 comments on commit 10f1f83

Please sign in to comment.