Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable JCEF #695

Draft
wants to merge 6 commits into
base: main
Choose a base branch
from
Draft
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Try with HTMLUnit
koppor committed Sep 7, 2024

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature.
commit 9e64e0ce12788f7ea7385acce799568bbef0853b
3 changes: 3 additions & 0 deletions build.gradle
Original file line number Diff line number Diff line change
@@ -255,10 +255,13 @@ dependencies {

implementation 'org.controlsfx:controlsfx:11.2.1'

// region HTTP clients
implementation 'org.htmlunit:htmlunit:4.4.0' // used for web scraping
implementation 'org.jsoup:jsoup:1.18.1'
implementation 'com.konghq:unirest-java-core:4.4.4'
implementation 'com.konghq:unirest-modules-gson:4.4.4'
implementation 'org.apache.httpcomponents.client5:httpclient5:5.3.1'
// endregion

implementation 'org.slf4j:slf4j-api:2.0.16'
implementation 'org.tinylog:tinylog-api:2.7.0'
5 changes: 3 additions & 2 deletions src/main/java/module-info.java
Original file line number Diff line number Diff line change
@@ -90,10 +90,11 @@
requires org.glassfish.hk2.api;

// region: http clients
requires unirest.java.core;
requires unirest.modules.gson;
requires htmlunit;
requires org.apache.httpcomponents.core5.httpcore5;
requires org.jsoup;
requires unirest.java.core;
requires unirest.modules.gson;
// endregion

// region: SQL databases
40 changes: 23 additions & 17 deletions src/main/java/org/jabref/logic/importer/fetcher/ACS.java
Original file line number Diff line number Diff line change
@@ -10,6 +10,9 @@
import org.jabref.model.entry.field.StandardField;
import org.jabref.model.entry.identifier.DOI;

import org.htmlunit.BrowserVersion;
import org.htmlunit.WebClient;
import org.htmlunit.html.HtmlPage;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
@@ -25,34 +28,37 @@ public class ACS implements FulltextFetcher {
private static final String SOURCE = "https://pubs.acs.org/doi/abs/%s";

/**
* Tries to find a fulltext URL for a given BibTex entry.
* <p>
* Currently only uses the DOI if found.
*
* @param entry The Bibtex entry
* @return The fulltext PDF URL Optional, if found, or an empty Optional if not found.
* @throws NullPointerException if no BibTex entry is given
* @throws java.io.IOException
* Tries to find a fulltext URL for a given BibTeX entry.
* Requires the entry to have a DOI field.
* In case no DOI is present, an empty Optional is returned.
*/
@Override
public Optional<URL> findFullText(BibEntry entry) throws IOException {
Objects.requireNonNull(entry);

// DOI search
Optional<DOI> doi = entry.getField(StandardField.DOI).flatMap(DOI::parse);

if (!doi.isPresent()) {
return Optional.empty();
}

String source = SOURCE.formatted(doi.get().getDOI());
// Retrieve PDF link
Document html = Jsoup.connect(source).ignoreHttpErrors(true).get();
Element link = html.select("a.button_primary").first();

if (link != null) {
LOGGER.info("Fulltext PDF found @ ACS.");
return Optional.of(new URL(source.replaceFirst("/abs/", "/pdf/")));
// Load the ACS abstract page in a headless browser (HtmlUnit) so that cookies and
// JavaScript are handled, then check whether the page offers a PDF button.
try (final WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
    webClient.getOptions().setSSLClientProtocols("TLSv1.3", "TLSv1.2");
    // inspired by https://www.innoq.com/en/blog/2016/01/webscraping/
    webClient.getCookieManager().setCookiesEnabled(true);
    webClient.getOptions().setJavaScriptEnabled(true);
    webClient.getOptions().setTimeout(10_000);
    webClient.getOptions().setThrowExceptionOnScriptError(false);
    // Do not raise the unchecked FailingHttpStatusCodeException on HTTP error statuses;
    // an error page simply has no PDF button and results in an empty Optional.
    webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
    webClient.getOptions().setPrintContentOnFailingStatusCode(true);

    HtmlPage page = webClient.getPage(source);
    // Waiting only makes sense once the page is loaded: before getPage() there are
    // no background JavaScript jobs, so the call would return immediately.
    webClient.waitForBackgroundJavaScript(5000);

    // The button exists when the selector matches at least one element.
    boolean pdfButtonExists = !page.querySelectorAll("a[title=\"PDF\"].article__btn__secondary").isEmpty();
    if (pdfButtonExists) {
        LOGGER.info("Fulltext PDF found at ACS.");
        // We "guess" the URL instead of parsing the HTML for the actual link
        return Optional.of(new URL(source.replaceFirst("/abs/", "/pdf/")));
    }
}
return Optional.empty();
}