Merge pull request #993 from cyian-1756/chanRipperCleanUP

Cleaned up chan ripper and removed dead chans
Commit 00cab660a4 by cyian-1756, committed via GitHub on 2018-10-08 10:30:41 -05:00
4 changed files with 96 additions and 42 deletions

src/main/java/com/rarchives/ripme/ripper/rippers/ChanRipper.java

@@ -17,10 +17,16 @@ import org.jsoup.nodes.Element;
public class ChanRipper extends AbstractHTMLRipper {
private static List<ChanSite> explicit_domains = Arrays.asList(
new ChanSite(Arrays.asList("boards.4chan.org"), Arrays.asList("4cdn.org", "is.4chan.org", "is2.4chan.org", "is3.4chan.org")),
new ChanSite(Arrays.asList("4archive.org"), Arrays.asList("imgur.com")),
new ChanSite(Arrays.asList("archive.4plebs.org"), Arrays.asList("img.4plebs.org")),
new ChanSite(Arrays.asList("yuki.la"), Arrays.asList("55chan.org"))
new ChanSite("boards.4chan.org", Arrays.asList("4cdn.org", "is.4chan.org", "is2.4chan.org", "is3.4chan.org")),
new ChanSite("4archive.org", "imgur.com"),
new ChanSite("archive.4plebs.org", "img.4plebs.org"),
new ChanSite("yuki.la", "ii.yuki.la"),
new ChanSite("55chan.org"),
new ChanSite("desuchan.net"),
new ChanSite("boards.420chan.org"),
new ChanSite("7chan.org"),
new ChanSite("desuarchive.org", "desu-usergeneratedcontent.xyz"),
new ChanSite("8ch.net", "media.8ch.net")
);
private static List<String> url_piece_blacklist = Arrays.asList(
@@ -85,27 +91,6 @@ public class ChanRipper extends AbstractHTMLRipper {
}
}
if (url.toExternalForm().contains("desuchan.net") && url.toExternalForm().contains("/res/")) {
return true;
}
if (url.toExternalForm().contains("boards.420chan.org") && url.toExternalForm().contains("/res/")) {
return true;
}
if (url.toExternalForm().contains("7chan.org") && url.toExternalForm().contains("/res/")) {
return true;
}
if (url.toExternalForm().contains("xchan.pw") && url.toExternalForm().contains("/board/")) {
return true;
}
if (url.toExternalForm().contains("desuarchive.org")) {
return true;
}
if (url.toExternalForm().contains("8ch.net") && url.toExternalForm().contains("/res/")) {
return true;
}
if (url.toExternalForm().contains("55chan.org") && url.toExternalForm().contains("/res/")) {
return true;
}
return false;
}
@@ -209,7 +194,7 @@ public class ChanRipper extends AbstractHTMLRipper {
}
if (self_hosted || generalChanSite) {
-            p = Pattern.compile("^.*\\.(jpg|jpeg|png|gif|apng|webp|tif|tiff|webm)$", Pattern.CASE_INSENSITIVE);
+            p = Pattern.compile("^.*\\.(jpg|jpeg|png|gif|apng|webp|tif|tiff|webm|mp4)$", Pattern.CASE_INSENSITIVE);
m = p.matcher(href);
if (m.matches()) {
if (href.startsWith("//")) {
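
The per-site checks deleted in the second hunk are exactly what the expanded explicit_domains list replaces: once a chan is registered as a ChanSite, a single generic loop can decide whether a URL belongs to a supported site. A minimal sketch of that idea, assuming a helper named urlMatchesChanSite and direct access to the domains field (neither is ripme's actual API):

    // Hedged sketch, not ripme's real code: one generic check over the
    // registered ChanSites instead of a hard-coded if-block per chan.
    private static boolean urlMatchesChanSite(URL url, List<ChanSite> sites) {
        String u = url.toExternalForm();
        for (ChanSite site : sites) {
            for (String domain : site.domains) { // assumes 'domains' is accessible
                if (u.contains(domain)) {
                    return true;
                }
            }
        }
        return false;
    }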

src/main/java/com/rarchives/ripme/ripper/rippers/ripperhelpers/ChanSite.java

@@ -1,5 +1,6 @@
package com.rarchives.ripme.ripper.rippers.ripperhelpers;
+import java.util.Arrays;
import java.util.List;
public class ChanSite {
@@ -19,6 +20,36 @@ public class ChanSite {
cdnDomains = CdnDomains;
}
+    public ChanSite(String Domain, List<String> CdnDomains) {
+        if (Domain.isEmpty()) {
+            throw new IllegalArgumentException("Domains");
+        }
+        if (CdnDomains.isEmpty()) {
+            throw new IllegalArgumentException("CdnDomains");
+        }
+        domains = Arrays.asList(Domain);
+        cdnDomains = CdnDomains;
+    }
+
+    public ChanSite(String Domain, String CdnDomain) {
+        if (Domain.isEmpty()) {
+            throw new IllegalArgumentException("Domains");
+        }
+        if (CdnDomain.isEmpty()) {
+            throw new IllegalArgumentException("CdnDomains");
+        }
+        domains = Arrays.asList(Domain);
+        cdnDomains = Arrays.asList(CdnDomain);
+    }
+
+    public ChanSite(String Domain) {
+        if (Domain.isEmpty()) {
+            throw new IllegalArgumentException("Domains");
+        }
+        domains = Arrays.asList(Domain);
+        cdnDomains = Arrays.asList(Domain);
+    }
public ChanSite(List<String> Domains) {
if (Domains.isEmpty()) {
throw new IllegalArgumentException("Domains");
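
Taken together, the new overloads drop the Arrays.asList() boilerplate at every call site. The rewritten explicit_domains list above uses each form; for example:

    // Single domain paired with a single CDN domain:
    new ChanSite("archive.4plebs.org", "img.4plebs.org");
    // Self-hosted chan: the CDN list defaults to the site's own domain:
    new ChanSite("55chan.org");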

src/test/java/com/rarchives/ripme/tst/ripper/rippers/ChanRipperTest.java

@@ -1,11 +1,14 @@
package com.rarchives.ripme.tst.ripper.rippers;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import com.rarchives.ripme.ripper.rippers.ChanRipper;
+import com.rarchives.ripme.utils.Http;
+import org.jsoup.nodes.Document;
public class ChanRipperTest extends RippersTest {
@@ -29,7 +32,6 @@ public class ChanRipperTest extends RippersTest {
passURLs.add(new URL("https://boards.4chan.org/hr/thread/3015701"));
passURLs.add(new URL("https://boards.420chan.org/420/res/232066.php"));
passURLs.add(new URL("http://7chan.org/gif/res/25873.html"));
passURLs.add(new URL("https://xchan.pw/board/porn/thread/874116/"));
for (URL url : passURLs) {
ChanRipper ripper = new ChanRipper(url);
ripper.setup();
@@ -42,24 +44,26 @@ public class ChanRipperTest extends RippersTest {
public void testChanRipper() throws IOException {
List<URL> contentURLs = new ArrayList<>();
// URLs that should return more than 1 image
//contentURLs.add(new URL("http://desuchan.net/v/res/7034.html"));
//contentURLs.add(new URL("http://boards.420chan.org/ana/res/75984.php"));
//contentURLs.add(new URL("http://archive.4plebs.org/s4s/thread/3005257/"));
//contentURLs.add(new URL("http://drawchan.net/dc/dw/res/114910.html"));
// Most *chans have volatile threads & can't be trusted for integration testing.
//contentURLs.add(new URL("http://boards.4chan.org/r/res/12225949"));
//contentURLs.add(new URL("http://7chan.org/gif/res/23795.html"));
//contentURLs.add(new URL("http://unichan2.org/b/res/518004.html"));
// xchan has an HTTPS certification error...
//contentURLs.add(new URL("http://xchan.pw/porn/res/437.html"));
+        contentURLs.add(new URL(getRandomThreadDesuarchive()));
for (URL url : contentURLs) {
ChanRipper ripper = new ChanRipper(url);
-            testRipper(ripper);
+            testChanRipper(ripper);
}
}
+    /**
+     * @return String a url to an active desuarchive.org thread
+     */
+    public String getRandomThreadDesuarchive() {
+        try {
+            Document doc = Http.url(new URL("https://desuarchive.org/wsg/")).get();
+            System.out.println(doc);
+            return doc.select("div.post_data > a").first().attr("href");
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+        return null;
+    }
}
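
getRandomThreadDesuarchive() depends on desuarchive's board-index markup, where, judging by the selector, each post's div.post_data block contains a link to its thread. A self-contained sketch of the same jsoup extraction against canned HTML (the markup below is an assumed stand-in, not a capture of the live page):

    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;

    public class SelectorSketch {
        public static void main(String[] args) {
            // Assumed stand-in for the board page; only the selector path matters.
            String html = "<div class=\"post_data\">"
                    + "<a href=\"https://desuarchive.org/wsg/thread/1234567/\">No.1234567</a>"
                    + "</div>";
            Document doc = Jsoup.parse(html);
            // Same selector as the test helper: first matching link wins.
            String threadUrl = doc.select("div.post_data > a").first().attr("href");
            System.out.println(threadUrl); // -> https://desuarchive.org/wsg/thread/1234567/
        }
    }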

src/test/java/com/rarchives/ripme/tst/ripper/rippers/RippersTest.java

@@ -2,7 +2,9 @@ package com.rarchives.ripme.tst.ripper.rippers;
import java.io.File;
import java.io.IOException;
+import java.util.List;
+import com.rarchives.ripme.ripper.rippers.ChanRipper;
import junit.framework.TestCase;
import org.apache.log4j.ConsoleAppender;
@@ -52,6 +54,38 @@ public class RippersTest extends TestCase {
}
}
+    // We have a special test for chan rippers because we can't assume that content will be downloadable, as content
+    // is often removed within mere hours of it being posted. So instead of trying to download any content we just check
+    // that we found links to it.
+    void testChanRipper(ChanRipper ripper) {
+        try {
+            // Turn on Debug logging
+            ((ConsoleAppender)Logger.getRootLogger().getAppender("stdout")).setThreshold(Level.DEBUG);
+            // Decrease timeout
+            Utils.setConfigInteger("page.timeout", 20 * 1000);
+            ripper.setup();
+            ripper.markAsTest();
+            List<String> foundUrls = ripper.getURLsFromPage(ripper.getFirstPage());
+            assertTrue("Failed to find single url on page " + ripper.getURL(), foundUrls.size() >= 1);
+        } catch (IOException e) {
+            if (e.getMessage().contains("Ripping interrupted")) {
+                // We expect some rips to get interrupted
+            } else {
+                e.printStackTrace();
+                fail("Failed to rip " + ripper.getURL() + " : " + e.getMessage());
+            }
+        } catch (Exception e) {
+            e.printStackTrace();
+            fail("Failed to rip " + ripper.getURL() + " : " + e.getMessage());
+        } finally {
+            deleteDir(ripper.getWorkingDir());
+        }
+    }
/** File extensions that are safe to delete. */
private static final String[] SAFE_EXTENSIONS =
{"png", "jpg", "jpeg", "gif",