From 6858565b14930d6def6469314b0e2a0071bcd26a Mon Sep 17 00:00:00 2001 From: Gurkengewuerz Date: Sun, 1 Dec 2019 19:35:02 +0100 Subject: [PATCH] updated crawler --- pom.xml | 16 + .../ripme/ripper/AbstractHTMLRipper.java | 247 ++++- .../ripme/ripper/AbstractJSONRipper.java | 231 ++++- .../ripme/ripper/AbstractRipper.java | 117 ++- .../ripper/AbstractSingleFileRipper.java | 2 +- .../rarchives/ripme/ripper/AlbumRipper.java | 17 +- .../ripme/ripper/DownloadFileThread.java | 166 ++-- .../ripme/ripper/rippers/GfycatRipper.java | 103 ++- .../ripme/ripper/rippers/InstagramRipper.java | 868 +++++++++--------- .../java/com/rarchives/ripme/utils/Utils.java | 37 +- 10 files changed, 1222 insertions(+), 582 deletions(-) diff --git a/pom.xml b/pom.xml index 2aa1d0c9..b6097400 100644 --- a/pom.xml +++ b/pom.xml @@ -11,6 +11,22 @@ UTF-8 + + javax.xml.bind + jaxb-api + 2.3.0 + + + com.sun.xml.bind + jaxb-core + 2.3.0 + + + com.sun.xml.bind + jaxb-impl + 2.3.0 + + junit junit diff --git a/src/main/java/com/rarchives/ripme/ripper/AbstractHTMLRipper.java b/src/main/java/com/rarchives/ripme/ripper/AbstractHTMLRipper.java index b24017f7..4d17f11e 100644 --- a/src/main/java/com/rarchives/ripme/ripper/AbstractHTMLRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/AbstractHTMLRipper.java @@ -2,21 +2,29 @@ package com.rarchives.ripme.ripper; import java.io.File; import java.io.FileOutputStream; +import java.io.FileWriter; import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; +import java.util.Collections; +import java.util.HashMap; import java.util.List; - +import java.util.Map; import org.jsoup.nodes.Document; import com.rarchives.ripme.ui.RipStatusMessage.STATUS; import com.rarchives.ripme.utils.Utils; import com.rarchives.ripme.ui.MainWindow; +import com.rarchives.ripme.ui.RipStatusMessage; /** * Simplified ripper, designed for ripping from sites by parsing HTML. 
 */
-public abstract class AbstractHTMLRipper extends AlbumRipper {
+public abstract class AbstractHTMLRipper extends AbstractRipper {
+
+    private Map<URL, File> itemsPending = Collections.synchronizedMap(new HashMap<URL, File>());
+    private Map<URL, File> itemsCompleted = Collections.synchronizedMap(new HashMap<URL, File>());
+    private Map<URL, String> itemsErrored = Collections.synchronizedMap(new HashMap<URL, String>());
 
     protected AbstractHTMLRipper(URL url) throws IOException {
         super(url);
@@ -93,6 +101,7 @@
 
                 // We set doc to null here so the while loop below this doesn't fire
                 doc = null;
+                LOGGER.debug("Adding items from " + this.url + " to queue");
             }
 
             while (doc != null) {
@@ -176,12 +185,12 @@
         }
         waitForThreads();
     }
-    
+
     /**
      * Gets the file name from the URL
-     * @param url 
+     * @param url
      *      URL that you want to get the filename from
-     * @return 
+     * @return
      *      Filename of the URL
      */
    private String fileNameFromURL(URL url) {
@@ -195,7 +204,7 @@
        return saveAs;
    }
    /**
-     * 
+     *
     * @param url
     *      Target URL
     * @param subdirectory
     *      Path to subdirectory where you want to save it
     * @param text
     *      Text you want to save
     * @param index
     *      Index in something like an album
-     * @return 
+     * @return
     *      True if ripped successfully
     *      False if failed
     */
@@ -226,11 +235,11 @@
            }
            saveFileAs = new File(
                    workingDir.getCanonicalPath()
-                    + subdirectory
-                    + File.separator
-                    + getPrefix(index)
-                    + fileName
-                    + ".txt");
+                            + subdirectory
+                            + File.separator
+                            + getPrefix(index)
+                            + fileName
+                            + ".txt");
            // Write the file
            FileOutputStream out = (new FileOutputStream(saveFileAs));
            out.write(text.getBytes());
@@ -246,12 +255,12 @@
        }
        return true;
    }
-    
+
    /**
     * Gets prefix based on where in the index it is
-     * @param index 
+     * @param index
     *      The index in question
-     * @return 
+     * @return
     *      Returns prefix for a file. (?)
    */
    protected String getPrefix(int index) {
@@ -261,4 +270,210 @@
        }
        return prefix;
    }
-}
+
+    /*
+     * ------ Methods copied from AlbumRipper. ------
+     * This removes AlbumRipper's usage from this class.
+     */
+
+    protected boolean allowDuplicates() {
+        return false;
+    }
+
+    @Override
+    /**
+     * Returns the total number of files attempted.
+     */
+    public int getCount() {
+        return itemsCompleted.size() + itemsErrored.size();
+    }
+
+    @Override
+    /**
+     * Queues multiple URLs of single images to download from a single Album URL
+     */
+    public boolean addURLToDownload(URL url, File saveAs, String referrer, Map<String, String> cookies, Boolean getFileExtFromMIME) {
+        // Only download one file if this is a test.
+        if (super.isThisATest() &&
+                (itemsPending.size() > 0 || itemsCompleted.size() > 0 || itemsErrored.size() > 0)) {
+            stop();
+            return false;
+        }
+        if (!allowDuplicates()
+                && ( itemsPending.containsKey(url)
+                  || itemsCompleted.containsKey(url)
+                  || itemsErrored.containsKey(url) )) {
+            // Item is already downloaded/downloading, skip it.
+            LOGGER.info("[!] 
Skipping " + url + " -- already attempted: " + Utils.removeCWD(saveAs)); + return false; + } + if (Utils.getConfigBoolean("urls_only.save", false)) { + // Output URL to file + String urlFile = this.workingDir + File.separator + "urls.txt"; + try (FileWriter fw = new FileWriter(urlFile, true)) { + fw.write(url.toExternalForm()); + fw.write(System.lineSeparator()); + itemsCompleted.put(url, new File(urlFile)); + } catch (IOException e) { + LOGGER.error("Error while writing to " + urlFile, e); + } + } + else { + itemsPending.put(url, saveAs); + DownloadFileThread dft = new DownloadFileThread(url, saveAs, this, getFileExtFromMIME); + if (referrer != null) { + dft.setReferrer(referrer); + } + if (cookies != null) { + dft.setCookies(cookies); + } + threadPool.addThread(dft); + } + + return true; + } + + @Override + public boolean addURLToDownload(URL url, File saveAs) { + return addURLToDownload(url, saveAs, null, null, false); + } + + /** + * Queues image to be downloaded and saved. + * Uses filename from URL to decide filename. + * @param url + * URL to download + * @return + * True on success + */ + protected boolean addURLToDownload(URL url) { + // Use empty prefix and empty subdirectory + return addURLToDownload(url, "", ""); + } + + @Override + /** + * Cleans up & tells user about successful download + */ + public void downloadCompleted(URL url, File saveAs) { + if (observer == null) { + return; + } + try { + String path = Utils.removeCWD(saveAs); + RipStatusMessage msg = new RipStatusMessage(STATUS.DOWNLOAD_COMPLETE, path); + itemsPending.remove(url); + itemsCompleted.put(url, saveAs); + observer.update(this, msg); + + checkIfComplete(); + } catch (Exception e) { + LOGGER.error("Exception while updating observer: ", e); + } + } + + @Override + /** + * Cleans up & tells user about failed download. + */ + public void downloadErrored(URL url, String reason) { + if (observer == null) { + return; + } + itemsPending.remove(url); + itemsErrored.put(url, reason); + observer.update(this, new RipStatusMessage(STATUS.DOWNLOAD_ERRORED, url + " : " + reason)); + + checkIfComplete(); + } + + @Override + /** + * Tells user that a single file in the album they wish to download has + * already been downloaded in the past. + */ + public void downloadExists(URL url, File file) { + if (observer == null) { + return; + } + + itemsPending.remove(url); + itemsCompleted.put(url, file); + observer.update(this, new RipStatusMessage(STATUS.DOWNLOAD_WARN, url + " already saved as " + file.getAbsolutePath())); + + checkIfComplete(); + } + + /** + * Notifies observers and updates state if all files have been ripped. + */ + @Override + protected void checkIfComplete() { + if (observer == null) { + return; + } + if (itemsPending.isEmpty()) { + super.checkIfComplete(); + } + } + + /** + * Sets directory to save all ripped files to. + * @param url + * URL to define how the working directory should be saved. 
+ * @throws + * IOException + */ + @Override + public void setWorkingDir(URL url) throws IOException { + String path = Utils.getWorkingDirectory().getCanonicalPath(); + if (!path.endsWith(File.separator)) { + path += File.separator; + } + String title; + if (Utils.getConfigBoolean("album_titles.save", true)) { + title = getAlbumTitle(this.url); + } else { + title = super.getAlbumTitle(this.url); + } + LOGGER.debug("Using album title '" + title + "'"); + + title = Utils.filesystemSafe(title); + path += title; + path = Utils.getOriginalDirectory(path) + File.separator; // check for case sensitive (unix only) + + this.workingDir = new File(path); + if (!this.workingDir.exists()) { + LOGGER.info("[+] Creating directory: " + Utils.removeCWD(this.workingDir)); + this.workingDir.mkdirs(); + } + LOGGER.debug("Set working directory to: " + this.workingDir); + } + + /** + * @return + * Integer between 0 and 100 defining the progress of the album rip. + */ + @Override + public int getCompletionPercentage() { + double total = itemsPending.size() + itemsErrored.size() + itemsCompleted.size(); + return (int) (100 * ( (total - itemsPending.size()) / total)); + } + + /** + * @return + * Human-readable information on the status of the current rip. + */ + @Override + public String getStatusText() { + StringBuilder sb = new StringBuilder(); + sb.append(getCompletionPercentage()) + .append("% ") + .append("- Pending: " ).append(itemsPending.size()) + .append(", Completed: ").append(itemsCompleted.size()) + .append(", Errored: " ).append(itemsErrored.size()); + return sb.toString(); + } + + +} \ No newline at end of file diff --git a/src/main/java/com/rarchives/ripme/ripper/AbstractJSONRipper.java b/src/main/java/com/rarchives/ripme/ripper/AbstractJSONRipper.java index 4455270e..19f44240 100644 --- a/src/main/java/com/rarchives/ripme/ripper/AbstractJSONRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/AbstractJSONRipper.java @@ -1,19 +1,27 @@ package com.rarchives.ripme.ripper; +import java.io.File; +import java.io.FileWriter; import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; +import java.util.Collections; +import java.util.HashMap; import java.util.List; - +import java.util.Map; import org.json.JSONObject; - +import com.rarchives.ripme.ui.RipStatusMessage; import com.rarchives.ripme.ui.RipStatusMessage.STATUS; import com.rarchives.ripme.utils.Utils; /** * Simplified ripper, designed for ripping from sites by parsing JSON. 
 */
-public abstract class AbstractJSONRipper extends AlbumRipper {
+public abstract class AbstractJSONRipper extends AbstractRipper {
+
+    private Map<URL, File> itemsPending = Collections.synchronizedMap(new HashMap<URL, File>());
+    private Map<URL, File> itemsCompleted = Collections.synchronizedMap(new HashMap<URL, File>());
+    private Map<URL, String> itemsErrored = Collections.synchronizedMap(new HashMap<URL, String>());
 
     protected AbstractJSONRipper(URL url) throws IOException {
         super(url);
@@ -56,12 +64,12 @@
 
         while (json != null) {
             List<String> imageURLs = getURLsFromJSON(json);
-            
+
             if (alreadyDownloadedUrls >= Utils.getConfigInteger("history.end_rip_after_already_seen", 1000000000) && !isThisATest()) {
-                sendUpdate(STATUS.DOWNLOAD_COMPLETE, "Already seen the last " + alreadyDownloadedUrls + " images ending rip");
-                break;
+                sendUpdate(STATUS.DOWNLOAD_COMPLETE, "Already seen the last " + alreadyDownloadedUrls + " images ending rip");
+                break;
             }
-            
+
             // Remove all but 1 image
             if (isThisATest()) {
                 while (imageURLs.size() > 1) {
@@ -77,7 +85,7 @@
             if (isStopped()) {
                 break;
             }
-            
+
             index += 1;
             LOGGER.debug("Found image url #" + index+ ": " + imageURL);
             downloadURL(new URL(imageURL), index);
@@ -111,4 +119,209 @@
         }
         return prefix;
     }
-}
+
+    /*
+     * ------ Methods copied from AlbumRipper ------
+     */
+
+    protected boolean allowDuplicates() {
+        return false;
+    }
+
+    @Override
+    /**
+     * Returns the total number of files attempted.
+     */
+    public int getCount() {
+        return itemsCompleted.size() + itemsErrored.size();
+    }
+
+    @Override
+    /**
+     * Queues multiple URLs of single images to download from a single Album URL
+     */
+    public boolean addURLToDownload(URL url, File saveAs, String referrer, Map<String, String> cookies, Boolean getFileExtFromMIME) {
+        // Only download one file if this is a test.
+        if (super.isThisATest() &&
+                (itemsPending.size() > 0 || itemsCompleted.size() > 0 || itemsErrored.size() > 0)) {
+            stop();
+            return false;
+        }
+        if (!allowDuplicates()
+                && ( itemsPending.containsKey(url)
+                  || itemsCompleted.containsKey(url)
+                  || itemsErrored.containsKey(url) )) {
+            // Item is already downloaded/downloading, skip it.
+            LOGGER.info("[!] Skipping " + url + " -- already attempted: " + Utils.removeCWD(saveAs));
+            return false;
+        }
+        if (Utils.getConfigBoolean("urls_only.save", false)) {
+            // Output URL to file
+            String urlFile = this.workingDir + File.separator + "urls.txt";
+            try (FileWriter fw = new FileWriter(urlFile, true)) {
+                fw.write(url.toExternalForm());
+                fw.write(System.lineSeparator());
+                itemsCompleted.put(url, new File(urlFile));
+            } catch (IOException e) {
+                LOGGER.error("Error while writing to " + urlFile, e);
+            }
+        }
+        else {
+            itemsPending.put(url, saveAs);
+            DownloadFileThread dft = new DownloadFileThread(url, saveAs, this, getFileExtFromMIME);
+            if (referrer != null) {
+                dft.setReferrer(referrer);
+            }
+            if (cookies != null) {
+                dft.setCookies(cookies);
+            }
+            threadPool.addThread(dft);
+        }
+
+        return true;
+    }
+
+    @Override
+    public boolean addURLToDownload(URL url, File saveAs) {
+        return addURLToDownload(url, saveAs, null, null, false);
+    }
+
+    /**
+     * Queues image to be downloaded and saved.
+     * Uses filename from URL to decide filename.
+ * @param url + * URL to download + * @return + * True on success + */ + protected boolean addURLToDownload(URL url) { + // Use empty prefix and empty subdirectory + return addURLToDownload(url, "", ""); + } + + @Override + /** + * Cleans up & tells user about successful download + */ + public void downloadCompleted(URL url, File saveAs) { + if (observer == null) { + return; + } + try { + String path = Utils.removeCWD(saveAs); + RipStatusMessage msg = new RipStatusMessage(STATUS.DOWNLOAD_COMPLETE, path); + itemsPending.remove(url); + itemsCompleted.put(url, saveAs); + observer.update(this, msg); + + checkIfComplete(); + } catch (Exception e) { + LOGGER.error("Exception while updating observer: ", e); + } + } + + @Override + /** + * Cleans up & tells user about failed download. + */ + public void downloadErrored(URL url, String reason) { + if (observer == null) { + return; + } + itemsPending.remove(url); + itemsErrored.put(url, reason); + observer.update(this, new RipStatusMessage(STATUS.DOWNLOAD_ERRORED, url + " : " + reason)); + + checkIfComplete(); + } + + @Override + /** + * Tells user that a single file in the album they wish to download has + * already been downloaded in the past. + */ + public void downloadExists(URL url, File file) { + if (observer == null) { + return; + } + + itemsPending.remove(url); + itemsCompleted.put(url, file); + observer.update(this, new RipStatusMessage(STATUS.DOWNLOAD_WARN, url + " already saved as " + file.getAbsolutePath())); + + checkIfComplete(); + } + + /** + * Notifies observers and updates state if all files have been ripped. + */ + @Override + protected void checkIfComplete() { + if (observer == null) { + return; + } + if (itemsPending.isEmpty()) { + super.checkIfComplete(); + } + } + + /** + * Sets directory to save all ripped files to. + * @param url + * URL to define how the working directory should be saved. + * @throws + * IOException + */ + @Override + public void setWorkingDir(URL url) throws IOException { + String path = Utils.getWorkingDirectory().getCanonicalPath(); + if (!path.endsWith(File.separator)) { + path += File.separator; + } + String title; + if (Utils.getConfigBoolean("album_titles.save", true)) { + title = getAlbumTitle(this.url); + } else { + title = super.getAlbumTitle(this.url); + } + LOGGER.debug("Using album title '" + title + "'"); + + title = Utils.filesystemSafe(title); + path += title; + path = Utils.getOriginalDirectory(path) + File.separator; // check for case sensitive (unix only) + + this.workingDir = new File(path); + if (!this.workingDir.exists()) { + LOGGER.info("[+] Creating directory: " + Utils.removeCWD(this.workingDir)); + this.workingDir.mkdirs(); + } + LOGGER.debug("Set working directory to: " + this.workingDir); + } + + /** + * @return + * Integer between 0 and 100 defining the progress of the album rip. + */ + @Override + public int getCompletionPercentage() { + double total = itemsPending.size() + itemsErrored.size() + itemsCompleted.size(); + return (int) (100 * ( (total - itemsPending.size()) / total)); + } + + /** + * @return + * Human-readable information on the status of the current rip. 
+ */ + @Override + public String getStatusText() { + StringBuilder sb = new StringBuilder(); + sb.append(getCompletionPercentage()) + .append("% ") + .append("- Pending: " ).append(itemsPending.size()) + .append(", Completed: ").append(itemsCompleted.size()) + .append(", Errored: " ).append(itemsErrored.size()); + return sb.toString(); + } + + +} \ No newline at end of file diff --git a/src/main/java/com/rarchives/ripme/ripper/AbstractRipper.java b/src/main/java/com/rarchives/ripme/ripper/AbstractRipper.java index 95643b4c..19d1bf77 100644 --- a/src/main/java/com/rarchives/ripme/ripper/AbstractRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/AbstractRipper.java @@ -1,7 +1,11 @@ package com.rarchives.ripme.ripper; import java.awt.Desktop; -import java.io.*; +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileWriter; +import java.io.IOException; import java.lang.reflect.Constructor; import java.net.MalformedURLException; import java.net.URL; @@ -9,24 +13,20 @@ import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.Observable; - -import com.rarchives.ripme.App; +import java.util.Scanner; import org.apache.log4j.FileAppender; import org.apache.log4j.Logger; import org.jsoup.HttpStatusException; - +import com.rarchives.ripme.App; import com.rarchives.ripme.ui.RipStatusComplete; import com.rarchives.ripme.ui.RipStatusHandler; import com.rarchives.ripme.ui.RipStatusMessage; import com.rarchives.ripme.ui.RipStatusMessage.STATUS; import com.rarchives.ripme.utils.Utils; -import java.io.File; -import java.util.Scanner; - public abstract class AbstractRipper - extends Observable - implements RipperInterface, Runnable { + extends Observable + implements RipperInterface, Runnable { protected static final Logger LOGGER = Logger.getLogger(AbstractRipper.class); private final String URLHistoryFile = Utils.getURLHistoryFile(); @@ -67,7 +67,7 @@ public abstract class AbstractRipper * Adds a URL to the url history file * @param downloadedURL URL to check if downloaded */ - private void writeDownloadedURL(String downloadedURL) throws IOException { + protected void writeDownloadedURL(String downloadedURL) throws IOException { // If "save urls only" is checked don't write to the url history file if (Utils.getConfigBoolean("urls_only.save", false)) { return; @@ -123,15 +123,15 @@ public abstract class AbstractRipper public String normalizeUrl(String url) { return url; } - + /** * Checks to see if Ripme has already downloaded a URL * @param url URL to check if downloaded - * @return + * @return * Returns true if previously downloaded. * Returns false if not yet downloaded. */ - private boolean hasDownloadedURL(String url) { + protected boolean hasDownloadedURL(String url) { File file = new File(URLHistoryFile); url = normalizeUrl(url); @@ -172,7 +172,7 @@ public abstract class AbstractRipper * Logger (for debugging) * FileAppender * Threadpool - * @throws IOException + * @throws IOException * Always be prepared. */ public void setup() throws IOException { @@ -218,6 +218,44 @@ public abstract class AbstractRipper protected abstract boolean addURLToDownload(URL url, File saveAs, String referrer, Map cookies, Boolean getFileExtFromMIME); + /** + * Queues image to be downloaded and saved. + * @param url + * URL of the file + * @param options + * A map containing any changes to the default options. + * Options are getFileExtFromMIME, prefix, subdirectory, referrer, fileName, extension, getFileExtFromMIME. 
+ * getFileExtFromMIME should be "true" or "false" + * @param cookies + * The cookies to send to the server while downloading this file. + * @return + * True if downloaded successfully + * False if failed to download + */ + protected boolean addURLToDownload(URL url, Map options, Map cookies) { + // Bit of a hack but this lets us pass a bool using a map + boolean useMIME = options.getOrDefault("getFileExtFromMIME", "false").toLowerCase().equals("true"); + return addURLToDownload(url, options.getOrDefault("prefix", ""), options.getOrDefault("subdirectory", ""), options.getOrDefault("referrer", null), + cookies, options.getOrDefault("fileName", null), options.getOrDefault("extension", null), useMIME); + } + + + /** + * Queues image to be downloaded and saved. + * @param url + * URL of the file + * @param options + * A map containing any changes to the default options. + * Options are getFileExtFromMIME, prefix, subdirectory, referrer, fileName, extension, getFileExtFromMIME. + * getFileExtFromMIME should be "true" or "false" + * @return + * True if downloaded successfully + * False if failed to download + */ + protected boolean addURLToDownload(URL url, Map options) { + return addURLToDownload(url, options, null); + } + /** * Queues image to be downloaded and saved. * @param url @@ -232,11 +270,27 @@ public abstract class AbstractRipper * The cookies to send to the server while downloading this file. * @param fileName * The name that file will be written to - * @return + * @return * True if downloaded successfully * False if failed to download */ protected boolean addURLToDownload(URL url, String prefix, String subdirectory, String referrer, Map cookies, String fileName, String extension, Boolean getFileExtFromMIME) { + // A common bug is rippers adding urls that are just "http:". This rejects said urls + if (url.toExternalForm().equals("http:") || url.toExternalForm().equals("https:")) { + LOGGER.info(url.toExternalForm() + " is a invalid url amd will be changed"); + return false; + + } + // Make sure the url doesn't contain any spaces as that can cause a 400 error when requesting the file + if (url.toExternalForm().contains(" ")) { + // If for some reason the url with all spaces encoded as %20 is malformed print an error + try { + url = new URL(url.toExternalForm().replaceAll(" ", "%20")); + } catch (MalformedURLException e) { + LOGGER.error("Unable to remove spaces from url\nURL: " + url.toExternalForm()); + e.printStackTrace(); + } + } // Don't re-add the url if it was downloaded in a previous rip if (Utils.getConfigBoolean("remember.url_history", true) && !isThisATest()) { if (hasDownloadedURL(url.toExternalForm())) { @@ -266,10 +320,10 @@ public abstract class AbstractRipper } saveFileAs = new File( topFolderName - + subdirectory - + File.separator - + prefix - + saveAs); + + subdirectory + + File.separator + + prefix + + saveAs); } catch (IOException e) { LOGGER.error("[!] Error creating save file path for URL '" + url + "':", e); return false; @@ -280,6 +334,7 @@ public abstract class AbstractRipper saveFileAs.getParentFile().mkdirs(); } if (Utils.getConfigBoolean("remember.url_history", true) && !isThisATest()) { + LOGGER.info("Writing " + url.toExternalForm() + " to file"); try { writeDownloadedURL(url.toExternalForm() + "\n"); } catch (IOException e) { @@ -447,7 +502,7 @@ public abstract class AbstractRipper /** * Gets URL - * @return + * @return * Returns URL that wants to be downloaded. 
*/ public URL getURL() { @@ -467,14 +522,14 @@ public abstract class AbstractRipper public abstract void setWorkingDir(URL url) throws IOException; /** - * - * @param url + * + * @param url * The URL you want to get the title of. * @return * host_URLid * e.g. (for a reddit post) * reddit_post_7mg2ur - * @throws MalformedURLException + * @throws MalformedURLException * If any of those damned URLs gets malformed. */ public String getAlbumTitle(URL url) throws MalformedURLException { @@ -493,7 +548,7 @@ public abstract class AbstractRipper public static AbstractRipper getRipper(URL url) throws Exception { for (Constructor constructor : getRipperConstructors("com.rarchives.ripme.ripper.rippers")) { try { - AlbumRipper ripper = (AlbumRipper) constructor.newInstance(url); // by design: can throw ClassCastException + AbstractRipper ripper = (AbstractRipper) constructor.newInstance(url); // by design: can throw ClassCastException LOGGER.debug("Found album ripper: " + ripper.getClass().getName()); return ripper; } catch (Exception e) { @@ -531,7 +586,7 @@ public abstract class AbstractRipper /** * Sends an update message to the relevant observer(s) on this ripper. - * @param status + * @param status * @param message */ public void sendUpdate(STATUS status, Object message) { @@ -540,15 +595,15 @@ public abstract class AbstractRipper } observer.update(this, new RipStatusMessage(status, message)); } - + /** * Get the completion percentage. - * @return + * @return * Percentage complete */ public abstract int getCompletionPercentage(); /** - * @return + * @return * Text for status */ public abstract String getStatusText(); @@ -584,12 +639,12 @@ public abstract class AbstractRipper } } } - + /** * Pauses thread for a set amount of time. * @param milliseconds * Amount of time (in milliseconds) that the thread gets paused for - * @return + * @return * True if paused successfully * False if failed to pause/got interrupted. */ @@ -624,4 +679,4 @@ public abstract class AbstractRipper protected boolean useByteProgessBar() { return false;} // If true ripme will try to resume a broken download for this ripper protected boolean tryResumeDownload() { return false;} -} +} \ No newline at end of file diff --git a/src/main/java/com/rarchives/ripme/ripper/AbstractSingleFileRipper.java b/src/main/java/com/rarchives/ripme/ripper/AbstractSingleFileRipper.java index f1f8be41..caf69c37 100644 --- a/src/main/java/com/rarchives/ripme/ripper/AbstractSingleFileRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/AbstractSingleFileRipper.java @@ -40,4 +40,4 @@ public abstract class AbstractSingleFileRipper extends AbstractHTMLRipper { @Override public boolean useByteProgessBar() {return true;} -} +} \ No newline at end of file diff --git a/src/main/java/com/rarchives/ripme/ripper/AlbumRipper.java b/src/main/java/com/rarchives/ripme/ripper/AlbumRipper.java index 97943b33..fa6ce3ba 100644 --- a/src/main/java/com/rarchives/ripme/ripper/AlbumRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/AlbumRipper.java @@ -59,8 +59,8 @@ public abstract class AlbumRipper extends AbstractRipper { } if (!allowDuplicates() && ( itemsPending.containsKey(url) - || itemsCompleted.containsKey(url) - || itemsErrored.containsKey(url) )) { + || itemsCompleted.containsKey(url) + || itemsErrored.containsKey(url) )) { // Item is already downloaded/downloading, skip it. LOGGER.info("[!] 
Skipping " + url + " -- already attempted: " + Utils.removeCWD(saveAs)); return false; @@ -70,7 +70,7 @@ public abstract class AlbumRipper extends AbstractRipper { String urlFile = this.workingDir + File.separator + "urls.txt"; try (FileWriter fw = new FileWriter(urlFile, true)) { fw.write(url.toExternalForm()); - fw.write("\n"); + fw.write(System.lineSeparator()); itemsCompleted.put(url, new File(urlFile)); } catch (IOException e) { LOGGER.error("Error while writing to " + urlFile, e); @@ -87,6 +87,7 @@ public abstract class AlbumRipper extends AbstractRipper { } threadPool.addThread(dft); } + return true; } @@ -225,10 +226,10 @@ public abstract class AlbumRipper extends AbstractRipper { public String getStatusText() { StringBuilder sb = new StringBuilder(); sb.append(getCompletionPercentage()) - .append("% ") - .append("- Pending: " ).append(itemsPending.size()) - .append(", Completed: ").append(itemsCompleted.size()) - .append(", Errored: " ).append(itemsErrored.size()); + .append("% ") + .append("- Pending: " ).append(itemsPending.size()) + .append(", Completed: ").append(itemsCompleted.size()) + .append(", Errored: " ).append(itemsErrored.size()); return sb.toString(); } -} +} \ No newline at end of file diff --git a/src/main/java/com/rarchives/ripme/ripper/DownloadFileThread.java b/src/main/java/com/rarchives/ripme/ripper/DownloadFileThread.java index 2f8a1503..3680af6b 100644 --- a/src/main/java/com/rarchives/ripme/ripper/DownloadFileThread.java +++ b/src/main/java/com/rarchives/ripme/ripper/DownloadFileThread.java @@ -1,12 +1,6 @@ package com.rarchives.ripme.ripper; -import java.io.BufferedInputStream; -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.lang.reflect.Array; +import java.io.*; import java.net.HttpURLConnection; import java.net.SocketTimeoutException; import java.net.URL; @@ -19,26 +13,22 @@ import java.util.ResourceBundle; import javax.net.ssl.HttpsURLConnection; import com.rarchives.ripme.ui.MainWindow; -import org.apache.commons.io.IOUtils; import org.apache.log4j.Logger; import org.jsoup.HttpStatusException; import com.rarchives.ripme.ui.RipStatusMessage.STATUS; import com.rarchives.ripme.utils.Utils; -import com.rarchives.ripme.ripper.AbstractRipper; /** - * Thread for downloading files. - * Includes retry logic, observer notifications, and other goodies. + * Thread for downloading files. Includes retry logic, observer notifications, + * and other goodies. */ class DownloadFileThread extends Thread { - private ResourceBundle rb = MainWindow.rb; - private static final Logger logger = Logger.getLogger(DownloadFileThread.class); private String referrer = ""; - private Map cookies = new HashMap<>(); + private Map cookies = new HashMap<>(); private URL url; private File saveAs; @@ -63,16 +53,19 @@ class DownloadFileThread extends Thread { public void setReferrer(String referrer) { this.referrer = referrer; } - public void setCookies(Map cookies) { + + public void setCookies(Map cookies) { this.cookies = cookies; } - /** - * Attempts to download the file. Retries as needed. - * Notifies observers upon completion/error/warn. + * Attempts to download the file. Retries as needed. Notifies observers upon + * completion/error/warn. 
*/ public void run() { + // First thing we make sure the file name doesn't have any illegal chars in it + saveAs = new File( + saveAs.getParentFile().getAbsolutePath() + File.separator + Utils.sanitizeSaveAs(saveAs.getName())); long fileSize = 0; int bytesTotal = 0; int bytesDownloaded = 0; @@ -85,13 +78,15 @@ class DownloadFileThread extends Thread { observer.downloadErrored(url, rb.getString("download.interrupted")); return; } - if (saveAs.exists() && !observer.tryResumeDownload() && !getFileExtFromMIME || - Utils.fuzzyExists(new File(saveAs.getParent()), saveAs.getName()) && getFileExtFromMIME && !observer.tryResumeDownload()) { + if (saveAs.exists() && !observer.tryResumeDownload() && !getFileExtFromMIME + || Utils.fuzzyExists(new File(saveAs.getParent()), saveAs.getName()) && getFileExtFromMIME + && !observer.tryResumeDownload()) { if (Utils.getConfigBoolean("file.overwrite", false)) { logger.info("[!] " + rb.getString("deleting.existing.file") + prettySaveAs); saveAs.delete(); } else { - logger.info("[!] " + rb.getString("skipping") + url + " -- " + rb.getString("file.already.exists") + ": " + prettySaveAs); + logger.info("[!] " + rb.getString("skipping") + url + " -- " + + rb.getString("file.already.exists") + ": " + prettySaveAs); observer.downloadExists(url, saveAs); return; } @@ -101,7 +96,8 @@ class DownloadFileThread extends Thread { int tries = 0; // Number of attempts to download do { tries += 1; - InputStream bis = null; OutputStream fos = null; + InputStream bis = null; + OutputStream fos = null; try { logger.info(" Downloading file: " + urlToDownload + (tries > 0 ? " Retry #" + tries : "")); observer.sendUpdate(STATUS.DOWNLOAD_STARTED, url.toExternalForm()); @@ -110,16 +106,16 @@ class DownloadFileThread extends Thread { HttpURLConnection huc; if (this.url.toString().startsWith("https")) { huc = (HttpsURLConnection) urlToDownload.openConnection(); - } - else { + } else { huc = (HttpURLConnection) urlToDownload.openConnection(); } huc.setInstanceFollowRedirects(true); - // It is important to set both ConnectTimeout and ReadTimeout. If you don't then ripme will wait forever + // It is important to set both ConnectTimeout and ReadTimeout. If you don't then + // ripme will wait forever // for the server to send data after connecting. huc.setConnectTimeout(TIMEOUT); huc.setReadTimeout(TIMEOUT); - huc.setRequestProperty("accept", "*/*"); + huc.setRequestProperty("accept", "*/*"); if (!referrer.equals("")) { huc.setRequestProperty("Referer", referrer); // Sic } @@ -142,11 +138,13 @@ class DownloadFileThread extends Thread { int statusCode = huc.getResponseCode(); logger.debug("Status code: " + statusCode); + // If the server doesn't allow resuming downloads error out if (statusCode != 206 && observer.tryResumeDownload() && saveAs.exists()) { - // TODO find a better way to handle servers that don't support resuming downloads then just erroring out + // TODO find a better way to handle servers that don't support resuming + // downloads then just erroring out throw new IOException(rb.getString("server.doesnt.support.resuming.downloads")); } - if (statusCode / 100 == 3) { // 3xx Redirect + if (statusCode / 100 == 3) { // 3xx Redirect if (!redirected) { // Don't increment retries on the first redirect tries--; @@ -158,12 +156,15 @@ class DownloadFileThread extends Thread { throw new IOException("Redirect status code " + statusCode + " - redirect to " + location); } if (statusCode / 100 == 4) { // 4xx errors - logger.error("[!] 
" + rb.getString("nonretriable.status.code") + " " + statusCode + " while downloading from " + url); - observer.downloadErrored(url, rb.getString("nonretriable.status.code") + " " + statusCode + " while downloading " + url.toExternalForm()); + logger.error("[!] " + rb.getString("nonretriable.status.code") + " " + statusCode + + " while downloading from " + url); + observer.downloadErrored(url, rb.getString("nonretriable.status.code") + " " + + statusCode + " while downloading " + url.toExternalForm()); return; // Not retriable, drop out. } if (statusCode / 100 == 5) { // 5xx errors - observer.downloadErrored(url, rb.getString("retriable.status.code") + " " + statusCode + " while downloading " + url.toExternalForm()); + observer.downloadErrored(url, rb.getString("retriable.status.code") + " " + statusCode + + " while downloading " + url.toExternalForm()); // Throw exception so download can be retried throw new IOException(rb.getString("retriable.status.code") + " " + statusCode); } @@ -174,7 +175,8 @@ class DownloadFileThread extends Thread { return; } - // If the ripper is using the bytes progress bar set bytesTotal to huc.getContentLength() + // If the ripper is using the bytes progress bar set bytesTotal to + // huc.getContentLength() if (observer.useByteProgessBar()) { bytesTotal = huc.getContentLength(); observer.setBytesTotal(bytesTotal); @@ -195,14 +197,15 @@ class DownloadFileThread extends Thread { logger.error("Was unable to get content type from stream"); // Try to get the file type from the magic number byte[] magicBytes = new byte[8]; - bis.read(magicBytes,0, 5); + bis.read(magicBytes, 0, 5); bis.reset(); fileExt = Utils.getEXTFromMagic(magicBytes); if (fileExt != null) { saveAs = new File(saveAs.toString() + "." + fileExt); } else { logger.error(rb.getString("was.unable.to.get.content.type.using.magic.number")); - logger.error(rb.getString("magic.number.was") + ": " + Arrays.toString(magicBytes)); + logger.error( + rb.getString("magic.number.was") + ": " + Arrays.toString(magicBytes)); } } } @@ -210,31 +213,54 @@ class DownloadFileThread extends Thread { if (statusCode == 206) { fos = new FileOutputStream(saveAs, true); } else { - fos = new FileOutputStream(saveAs); + try { + fos = new FileOutputStream(saveAs); + } catch (FileNotFoundException e) { + // We do this because some filesystems have a max name length + if (e.getMessage().contains("File name too long")) { + logger.error("The filename " + saveAs.getName() + + " is to long to be saved on this file system."); + logger.info("Shortening filename"); + String[] saveAsSplit = saveAs.getName().split("\\."); + // Get the file extension so when we shorten the file name we don't cut off the + // file extension + String fileExt = saveAsSplit[saveAsSplit.length - 1]; + // The max limit for filenames on Linux with Ext3/4 is 255 bytes + logger.info(saveAs.getName().substring(0, 254 - fileExt.length()) + fileExt); + String filename = saveAs.getName().substring(0, 254 - fileExt.length()) + "." 
+ fileExt; + // We can't just use the new file name as the saveAs because the file name + // doesn't include the + // users save path, so we get the user save path from the old saveAs + saveAs = new File(saveAs.getParentFile().getAbsolutePath() + File.separator + filename); + fos = new FileOutputStream(saveAs); + } else if (saveAs.getAbsolutePath().length() > 259 && Utils.isWindows()) { + // This if is for when the file path has gone above 260 chars which windows does + // not allow + fos = new FileOutputStream( + Utils.shortenSaveAsWindows(saveAs.getParentFile().getPath(), saveAs.getName())); + } + } } byte[] data = new byte[1024 * 256]; int bytesRead; - boolean shouldSkipFileDownload = huc.getContentLength() / 10000000 >= 10; - while ( (bytesRead = bis.read(data)) != -1) { - try { - observer.stopCheck(); - } catch (IOException e) { - observer.downloadErrored(url, rb.getString("download.interrupted")); - return; - } - fos.write(data, 0, bytesRead); - if (observer.useByteProgessBar()) { - bytesDownloaded += bytesRead; - observer.setBytesCompleted(bytesDownloaded); - observer.sendUpdate(STATUS.COMPLETED_BYTES, bytesDownloaded); - } - // If this is a test and we're downloading a large file - if (AbstractRipper.isThisATest() && shouldSkipFileDownload) { - logger.debug("Not downloading whole file because it is over 10mb and this is a test"); - bis.close(); - fos.close(); - break; - + boolean shouldSkipFileDownload = huc.getContentLength() / 1000000 >= 10 && AbstractRipper.isThisATest(); + // If this is a test rip we skip large downloads + if (shouldSkipFileDownload) { + logger.debug("Not downloading whole file because it is over 10mb and this is a test"); + } else { + while ((bytesRead = bis.read(data)) != -1) { + try { + observer.stopCheck(); + } catch (IOException e) { + observer.downloadErrored(url, rb.getString("download.interrupted")); + return; + } + fos.write(data, 0, bytesRead); + if (observer.useByteProgessBar()) { + bytesDownloaded += bytesRead; + observer.setBytesCompleted(bytesDownloaded); + observer.sendUpdate(STATUS.COMPLETED_BYTES, bytesDownloaded); + } } } bis.close(); @@ -249,24 +275,34 @@ class DownloadFileThread extends Thread { logger.debug(rb.getString("http.status.exception"), hse); logger.error("[!] HTTP status " + hse.getStatusCode() + " while downloading from " + urlToDownload); if (hse.getStatusCode() == 404 && Utils.getConfigBoolean("errors.skip404", false)) { - observer.downloadErrored(url, "HTTP status code " + hse.getStatusCode() + " while downloading " + url.toExternalForm()); + observer.downloadErrored(url, + "HTTP status code " + hse.getStatusCode() + " while downloading " + url.toExternalForm()); return; } } catch (IOException e) { logger.debug("IOException", e); - logger.error("[!] " + rb.getString("exception.while.downloading.file") + ": " + url + " - " + e.getMessage()); + logger.error("[!] " + rb.getString("exception.while.downloading.file") + ": " + url + " - " + + e.getMessage()); } finally { // Close any open streams try { - if (bis != null) { bis.close(); } - } catch (IOException e) { } + if (bis != null) { + bis.close(); + } + } catch (IOException e) { + } try { - if (fos != null) { fos.close(); } - } catch (IOException e) { } + if (fos != null) { + fos.close(); + } + } catch (IOException e) { + } } if (tries > this.retries) { - logger.error("[!] " + rb.getString ("exceeded.maximum.retries") + " (" + this.retries + ") for URL " + url); - observer.downloadErrored(url, rb.getString("failed.to.download") + " " + url.toExternalForm()); + logger.error("[!] 
" + rb.getString("exceeded.maximum.retries") + " (" + this.retries + + ") for URL " + url); + observer.downloadErrored(url, + rb.getString("failed.to.download") + " " + url.toExternalForm()); return; } } while (true); @@ -274,4 +310,4 @@ class DownloadFileThread extends Thread { logger.info("[+] Saved " + url + " as " + this.prettySaveAs); } -} +} \ No newline at end of file diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/GfycatRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/GfycatRipper.java index 9c2db859..e4da448d 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/GfycatRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/GfycatRipper.java @@ -9,19 +9,27 @@ import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; -import com.rarchives.ripme.ripper.AbstractSingleFileRipper; +import com.rarchives.ripme.ripper.AbstractHTMLRipper; +import org.json.JSONArray; +import org.json.JSONObject; import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import com.rarchives.ripme.utils.Http; -public class GfycatRipper extends AbstractSingleFileRipper { +public class GfycatRipper extends AbstractHTMLRipper { private static final String HOST = "gfycat.com"; + String username = ""; + String cursor = ""; + String count = "30"; + + public GfycatRipper(URL url) throws IOException { - super(url); + super(new URL(url.toExternalForm().split("-")[0].replace("thumbs.", ""))); } @Override @@ -41,14 +49,26 @@ public class GfycatRipper extends AbstractSingleFileRipper { @Override public URL sanitizeURL(URL url) throws MalformedURLException { - url = new URL(url.toExternalForm().replace("/gifs/detail", "")); - - return url; + String sUrl = url.toExternalForm(); + sUrl = sUrl.replace("/gifs/detail", ""); + sUrl = sUrl.replace("/amp", ""); + return new URL(sUrl); + } + + public boolean isProfile() { + Pattern p = Pattern.compile("^https?://[wm.]*gfycat\\.com/@([a-zA-Z0-9]+).*$"); + Matcher m = p.matcher(url.toExternalForm()); + return m.matches(); } @Override public Document getFirstPage() throws IOException { - return Http.url(url).get(); + if (!isProfile()) { + return Http.url(url).get(); + } else { + username = getGID(url); + return Http.url(new URL("https://api.gfycat.com/v1/users/" + username + "/gfycats")).ignoreContentType().get(); + } } @Override @@ -58,27 +78,58 @@ public class GfycatRipper extends AbstractSingleFileRipper { @Override public String getGID(URL url) throws MalformedURLException { - Pattern p = Pattern.compile("^https?://[wm.]*gfycat\\.com/([a-zA-Z0-9]+).*$"); + Pattern p = Pattern.compile("^https?://(thumbs\\.|[wm\\.]*)gfycat\\.com/@?([a-zA-Z0-9]+).*$"); Matcher m = p.matcher(url.toExternalForm()); - if (m.matches()) { - return m.group(1); - } + + if (m.matches()) + return m.group(2); throw new MalformedURLException( - "Expected gfycat.com format:" - + "gfycat.com/id" + "Expected gfycat.com format: " + + "gfycat.com/id or " + + "thumbs.gfycat.com/id.gif" + " Got: " + url); } + private String stripHTMLTags(String t) { + t = t.replaceAll("\n" + + " \n" + + " ", ""); + t = t.replaceAll("\n" + + "", ""); + t = t.replaceAll("\n", ""); + t = t.replaceAll("=\"\"", ""); + return t; + } + + @Override + public Document getNextPage(Document doc) throws IOException { + if (cursor.equals("")) { + throw new IOException("No more pages"); + } + return Http.url(new URL("https://api.gfycat.com/v1/users/" + username + "/gfycats?count=" + count + "&cursor=" + 
cursor)).ignoreContentType().get(); + } + @Override public List getURLsFromPage(Document doc) { List result = new ArrayList<>(); - Elements videos = doc.select("source"); - String vidUrl = videos.first().attr("src"); - if (vidUrl.startsWith("//")) { - vidUrl = "http:" + vidUrl; + if (isProfile()) { + JSONObject page = new JSONObject(stripHTMLTags(doc.html())); + JSONArray content = page.getJSONArray("gfycats"); + for (int i = 0; i < content.length(); i++) { + result.add(content.getJSONObject(i).getString("mp4Url")); + } + cursor = page.getString("cursor"); + } else { + Elements videos = doc.select("script"); + for (Element el : videos) { + String json = el.html(); + if (json.startsWith("{")) { + JSONObject page = new JSONObject(json); + result.add(page.getJSONObject("video").getString("contentUrl")); + } + } } - result.add(vidUrl); return result; } @@ -95,14 +146,14 @@ public class GfycatRipper extends AbstractSingleFileRipper { url = new URL(url.toExternalForm().replace("/gifs/detail", "")); Document doc = Http.url(url).get(); - Elements videos = doc.select("source"); - if (videos.isEmpty()) { - throw new IOException("Could not find source at " + url); + Elements videos = doc.select("script"); + for (Element el : videos) { + String json = el.html(); + if (json.startsWith("{")) { + JSONObject page = new JSONObject(json); + return page.getJSONObject("video").getString("contentUrl"); + } } - String vidUrl = videos.first().attr("src"); - if (vidUrl.startsWith("//")) { - vidUrl = "http:" + vidUrl; - } - return vidUrl; + throw new IOException(); } } \ No newline at end of file diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java index 57c3e6ab..f6c85591 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java @@ -1,480 +1,502 @@ package com.rarchives.ripme.ripper.rippers; -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStreamReader; -import java.io.UnsupportedEncodingException; -import java.net.MalformedURLException; -import java.net.URL; -import java.net.URLConnection; -import java.time.*; -import java.time.format.DateTimeFormatter; -import java.util.ArrayList; -import java.util.List; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import java.security.*; - -import org.json.JSONArray; -import org.json.JSONException; -import org.json.JSONObject; - import com.rarchives.ripme.ripper.AbstractJSONRipper; import com.rarchives.ripme.utils.Http; - +import com.rarchives.ripme.utils.Utils; +import jdk.nashorn.internal.ir.Block; +import jdk.nashorn.internal.ir.CallNode; +import jdk.nashorn.internal.ir.ExpressionStatement; +import jdk.nashorn.internal.ir.FunctionNode; +import jdk.nashorn.internal.ir.Statement; +import jdk.nashorn.internal.parser.Parser; +import jdk.nashorn.internal.runtime.Context; +import jdk.nashorn.internal.runtime.ErrorManager; +import jdk.nashorn.internal.runtime.Source; +import jdk.nashorn.internal.runtime.options.Options; +import org.json.JSONArray; +import org.json.JSONObject; import org.jsoup.Connection; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; -import com.rarchives.ripme.ui.RipStatusMessage; -import com.rarchives.ripme.utils.Utils; + +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URL; +import java.time.Instant; +import java.time.ZoneOffset; +import 
java.time.ZonedDateTime;
+import java.time.format.DateTimeFormatter;
+import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Spliterators;
+import java.util.function.BiFunction;
+import java.util.function.Consumer;
+import java.util.function.Function;
+import java.util.function.Predicate;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+import java.util.stream.StreamSupport;
+
+import static java.lang.String.format;
 
+// Available configuration options:
+// instagram.download_images_only - use to skip video links
+// instagram.session_id - should be set for stories and private accounts (look for sessionid cookie)
 public class InstagramRipper extends AbstractJSONRipper {
-    String nextPageID = "";
+
     private String qHash;
-    private boolean rippingTag = false;
-    private String tagName;
+    private Map<String, String> cookies = new HashMap<>();
 
-    private String userID;
-    private String rhx_gis = null;
-    private String csrftoken;
-    // Run into a weird issue with Jsoup cutting some json pages in half, this is a work around
-    // see https://github.com/RipMeApp/ripme/issues/601
-    private String workAroundJsonString;
+    private String idString;
+    private List<String> itemPrefixes = new ArrayList<>();
+    private List<String> failedItems = new ArrayList<>();
 
+    private boolean hashtagRip;
+    private boolean taggedRip;
+    private boolean igtvRip;
+    private boolean postRip;
+    private boolean storiesRip;
+    private boolean pinnedRip;
+    private boolean pinnedReelRip;
 
+    private enum UrlTypePattern {
+        // e.g. https://www.instagram.com/explore/tags/rachelc00k/
+        HASHTAG("explore/tags/(?<tagname>[^?/]+)"),
+
+        // e.g. https://www.instagram.com/stories/rachelc00k/
+        STORIES("stories/(?<username>[^?/]+)"),
+
+        // e.g. https://www.instagram.com/rachelc00k/tagged/
+        USER_TAGGED("(?<username>[^?/]+)/tagged"),
+
+        // e.g. https://www.instagram.com/rachelc00k/channel/
+        IGTV("(?<username>[^?/]+)/channel"),
+
+        // e.g. https://www.instagram.com/p/Bu4CEfbhNk4/
+        SINGLE_POST("(?:p|tv)/(?<shortcode>[^?/]+)"),
+
+        // pseudo-url, e.g. https://www.instagram.com/rachelc00k/?pinned
+        PINNED("(?<username>[^?/]+)/?[?]pinned"),
+
+        // e.g. https://www.instagram.com/rachelc00k/
+        USER_PROFILE("(?<username>[^?/]+)");
+
+        private final String urlTypePattern;
+
+        UrlTypePattern(String urlTypePattern) {
+            this.urlTypePattern = urlTypePattern;
+        }
+    }
 
     public InstagramRipper(URL url) throws IOException {
         super(url);
     }
 
     @Override
-    public String getHost() {
-        return "instagram";
-    }
-    @Override
-    public String getDomain() {
+    protected String getDomain() {
         return "instagram.com";
     }
 
     @Override
-    public boolean canRip(URL url) {
-        return (url.getHost().endsWith("instagram.com"));
-    }
-
-    @Override
-    public URL sanitizeURL(URL url) throws MalformedURLException {
-        URL san_url = new URL(url.toExternalForm().replaceAll("\\?hl=\\S*", ""));
-        LOGGER.info("sanitized URL is " + san_url.toExternalForm());
-        return san_url;
-    }
-
-    @Override
-    public String normalizeUrl(String url) {
-        // Remove the date sig from the url
-        return url.replaceAll("/[A-Z0-9]{8}/", "/");
-    }
-
-    @Override public boolean hasASAPRipping() {
-        return true;
-    }
-
-    private List<String> getPostsFromSinglePage(JSONObject json) {
-        List<String> imageURLs = new ArrayList<>();
-        JSONArray datas;
-        if (json.getJSONObject("entry_data").getJSONArray("PostPage")
-                .getJSONObject(0).getJSONObject("graphql").getJSONObject("shortcode_media")
-                .has("edge_sidecar_to_children")) {
-            datas = json.getJSONObject("entry_data").getJSONArray("PostPage")
-                    .getJSONObject(0).getJSONObject("graphql").getJSONObject("shortcode_media")
-                    .getJSONObject("edge_sidecar_to_children").getJSONArray("edges");
-            for (int i = 0; i < datas.length(); i++) {
-                JSONObject data = (JSONObject) datas.get(i);
-                data = data.getJSONObject("node");
-                if (data.has("is_video") && data.getBoolean("is_video")) {
-                    imageURLs.add(data.getString("video_url"));
-                } else {
-                    imageURLs.add(data.getString("display_url"));
-                }
-            }
-        } else {
-            JSONObject data = json.getJSONObject("entry_data").getJSONArray("PostPage")
-                    .getJSONObject(0).getJSONObject("graphql").getJSONObject("shortcode_media");
-            if (data.getBoolean("is_video")) {
-                imageURLs.add(data.getString("video_url"));
-            } else {
-                imageURLs.add(data.getString("display_url"));
-            }
-        }
-        return imageURLs;
+    public String getHost() {
+        return "instagram";
     }
 
     @Override
     public String getGID(URL url) throws MalformedURLException {
-        Pattern p = Pattern.compile("^https?://instagram.com/([^/]+)/?");
-        Matcher m = p.matcher(url.toExternalForm());
-        if (m.matches()) {
-            return m.group(1);
-        }
-
-        p = Pattern.compile("^https?://www.instagram.com/([^/]+)/?(?:\\?hl=\\S*)?/?");
-        m = p.matcher(url.toExternalForm());
-        if (m.matches()) {
-            return m.group(1);
-        }
-
-        p = Pattern.compile("^https?://www.instagram.com/p/([a-zA-Z0-9_-]+)/\\?taken-by=([^/]+)/?");
-        m = p.matcher(url.toExternalForm());
-        if (m.matches()) {
-            return m.group(2) + "_" + m.group(1);
-        }
-
-        p = Pattern.compile("^https?://www.instagram.com/p/([a-zA-Z0-9_-]+)/?");
-        m = p.matcher(url.toExternalForm());
-        if (m.matches()) {
-            return m.group(1);
-        }
-
-        p = Pattern.compile("^https?://www.instagram.com/p/([a-zA-Z0-9_-]+)/?(?:\\?hl=\\S*)?/?");
-        m = p.matcher(url.toExternalForm());
-        if (m.matches()) {
-            return m.group(1);
-        }
-
-        p = Pattern.compile("^https?://www.instagram.com/explore/tags/([^/]+)/?");
-        m = p.matcher(url.toExternalForm());
-        if (m.matches()) {
-            rippingTag = true;
-            tagName = m.group(1);
-            return m.group(1);
-        }
-
-        throw new MalformedURLException("Unable to find user in " + url);
-    }
-
-    private String stripHTMLTags(String t) {
-        t = t.replaceAll("<html>\n" +
-                " <head></head>\n" +
-                " <body>", "");
-        t.replaceAll("</body>\n" +
-                "</html>", "");
-        t = 
t.replaceAll("\n", ""); - t = t.replaceAll("=\"\"", ""); - return t; - } - - - private JSONObject getJSONFromPage(Document firstPage) throws IOException { - // Check if this page is HTML + JSON or jsut json - if (!firstPage.html().contains("window._sharedData =")) { - return new JSONObject(stripHTMLTags(firstPage.html())); - } - String jsonText = ""; - try { - for (Element script : firstPage.select("script[type=text/javascript]")) { - if (script.data().contains("window._sharedData = ")) { - jsonText = script.data().replaceAll("window._sharedData = ", ""); - jsonText = jsonText.replaceAll("};", "}"); + for (UrlTypePattern urlType : UrlTypePattern.values()) { + Matcher urlMatcher = getUrlMatcher(url, urlType); + if (urlMatcher.matches()) { + switch (urlType) { + case HASHTAG: + hashtagRip = true; + return "tag_" + urlMatcher.group("tagname"); + case PINNED: + pinnedRip = true; + return urlMatcher.group("username") + "_pinned"; + case STORIES: + storiesRip = true; + return urlMatcher.group("username") + "_stories"; + case USER_TAGGED: + taggedRip = true; + return urlMatcher.group("username") + "_tagged"; + case IGTV: + igtvRip = true; + return urlMatcher.group("username") + "_igtv"; + case SINGLE_POST: + postRip = true; + return "post_" + urlMatcher.group("shortcode"); + case USER_PROFILE: + return urlMatcher.group("username"); + default: + throw new RuntimeException("Reached unreachable"); } } - return new JSONObject(jsonText); - } catch (JSONException e) { - throw new IOException("Could not get JSON from page"); } + throw new MalformedURLException("This URL can't be ripped"); + } + + private Matcher getUrlMatcher(URL url, UrlTypePattern type) { + String baseRegex = "^https?://(?:www[.])?instagram[.]com/%s(?:[?/].*)?"; + Pattern pattern = Pattern.compile(format(baseRegex, type.urlTypePattern)); + return pattern.matcher(url.toExternalForm()); } @Override public JSONObject getFirstPage() throws IOException { - Connection.Response resp = Http.url(url).response(); - LOGGER.info(resp.cookies()); - csrftoken = resp.cookie("csrftoken"); - Document p = resp.parse(); - // Get the query hash so we can download the next page - qHash = getQHash(p); - return getJSONFromPage(p); + setAuthCookie(); + Document document = Http.url(url).cookies(cookies).response().parse(); + qHash = getQhash(document); + JSONObject jsonObject = getJsonObjectFromDoc(document); + String hashtagNamePath = "entry_data.TagPage[0].graphql.hashtag.name"; + String singlePostIdPath = "graphql.shortcode_media.shortcode"; + String profileIdPath = "entry_data.ProfilePage[0].graphql.user.id"; + String storiesPath = "entry_data.StoriesPage[0].user.id"; + String idPath = hashtagRip ? hashtagNamePath : storiesRip ? storiesPath : postRip ? singlePostIdPath : profileIdPath; + idString = getJsonStringByPath(jsonObject, idPath); + return taggedRip ? getNextPage(null) : pinnedRip ? getPinnedItems(document) : storiesRip ? getStoriesItems() : jsonObject; } - private String getVideoFromPage(String videoID) { + private void setAuthCookie() throws IOException { + String sessionId = Utils.getConfigString("instagram.session_id", null); + if ((storiesRip || pinnedRip) && sessionId == null) { + throw new IOException("instagram.session_id should be set up for Instagram stories"); + } + if (sessionId != null) { + cookies.put("sessionid", sessionId); + } + } + + // Query hash is used for graphql requests + private String getQhash(Document doc) throws IOException { + if (postRip) { + return null; + } + Predicate hrefFilter = (storiesRip || pinnedReelRip) ? 
+
+    private Matcher getUrlMatcher(URL url, UrlTypePattern type) {
+        String baseRegex = "^https?://(?:www[.])?instagram[.]com/%s(?:[?/].*)?";
+        Pattern pattern = Pattern.compile(format(baseRegex, type.urlTypePattern));
+        return pattern.matcher(url.toExternalForm());
     }
 
     @Override
     public JSONObject getFirstPage() throws IOException {
-        Connection.Response resp = Http.url(url).response();
-        LOGGER.info(resp.cookies());
-        csrftoken = resp.cookie("csrftoken");
-        Document p = resp.parse();
-        // Get the query hash so we can download the next page
-        qHash = getQHash(p);
-        return getJSONFromPage(p);
+        setAuthCookie();
+        Document document = Http.url(url).cookies(cookies).response().parse();
+        qHash = getQhash(document);
+        JSONObject jsonObject = getJsonObjectFromDoc(document);
+        String hashtagNamePath = "entry_data.TagPage[0].graphql.hashtag.name";
+        String singlePostIdPath = "graphql.shortcode_media.shortcode";
+        String profileIdPath = "entry_data.ProfilePage[0].graphql.user.id";
+        String storiesPath = "entry_data.StoriesPage[0].user.id";
+        String idPath = hashtagRip ? hashtagNamePath : storiesRip ? storiesPath : postRip ? singlePostIdPath : profileIdPath;
+        idString = getJsonStringByPath(jsonObject, idPath);
+        return taggedRip ? getNextPage(null) : pinnedRip ? getPinnedItems(document) : storiesRip ? getStoriesItems() : jsonObject;
     }
 
-    private String getVideoFromPage(String videoID) {
+    private void setAuthCookie() throws IOException {
+        String sessionId = Utils.getConfigString("instagram.session_id", null);
+        if ((storiesRip || pinnedRip) && sessionId == null) {
+            throw new IOException("instagram.session_id should be set up for Instagram stories");
+        }
+        if (sessionId != null) {
+            cookies.put("sessionid", sessionId);
+        }
+    }
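+    // Editor's note (assumption, not from the patch): the session id is read from
+    // ripme's config, so a rip.properties entry along the lines of
+    //   instagram.session_id = <value of the sessionid cookie from a logged-in browser>
+    // should enable stories/pinned rips; the exact value format is Instagram's own.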
+
+    // Query hash is used for graphql requests
+    private String getQhash(Document doc) throws IOException {
+        if (postRip) {
+            return null;
+        }
+        Predicate<String> hrefFilter = (storiesRip || pinnedReelRip) ?
+                href -> href.contains("Consumer.js") :
+                href -> href.contains("ProfilePageContainer.js") || href.contains("TagPageContainer.js");
+
+        String href = doc.select("link[rel=preload]").stream()
+                .map(link -> link.attr("href"))
+                .filter(hrefFilter)
+                .findFirst().orElse("");
+        String body = Http.url("https://www.instagram.com" + href).cookies(cookies).response().body();
+
+        Function<String, String> hashExtractor =
+                storiesRip || pinnedReelRip ? this::getStoriesHash :
+                pinnedRip ? this::getPinnedHash : hashtagRip ? this::getTagHash :
+                taggedRip ? this::getUserTagHash : this::getProfileHash;
+
+        return hashExtractor.apply(body);
+    }
+
+    private String getStoriesHash(String jsData) {
+        return getHashValue(jsData, "loadStoryViewers", -5);
+    }
+
+    private String getProfileHash(String jsData) {
+        return getHashValue(jsData, "loadProfilePageExtras", -1);
+    }
+
+    private String getPinnedHash(String jsData) {
+        return getHashValue(jsData, "loadProfilePageExtras", -2);
+    }
+
+    private String getTagHash(String jsData) {
+        return getHashValue(jsData, "requestNextTagMedia", -1);
+    }
+
+    private String getUserTagHash(String jsData) {
+        return getHashValue(jsData, "requestNextTaggedPosts", -1);
+    }
+
+    private JSONObject getJsonObjectFromDoc(Document document) {
+        for (Element script : document.select("script[type=text/javascript]")) {
+            String scriptText = script.data();
+            if (scriptText.startsWith("window._sharedData") || scriptText.startsWith("window.__additionalDataLoaded")) {
+                String jsonText = scriptText.replaceAll("[^{]*([{].*})[^}]*", "$1");
+                if (jsonText.contains("graphql") || jsonText.contains("StoriesPage")) {
+                    return new JSONObject(jsonText);
+                }
+            }
+        }
+        return null;
+    }
+
+    @Override
+    public JSONObject getNextPage(JSONObject source) throws IOException {
+        if (postRip || storiesRip || pinnedReelRip) {
+            return null;
+        }
+        JSONObject nextPageQuery = new JSONObject().put(hashtagRip ? "tag_name" : "id", idString).put("first", 12);
+        if (source == null) {
+            return graphqlRequest(nextPageQuery);
+        }
+        JSONObject pageInfo = getMediaRoot(source).getJSONObject("page_info");
+        if (pageInfo.getBoolean("has_next_page")) {
+            return graphqlRequest(nextPageQuery.put("after", pageInfo.getString("end_cursor")));
+        } else {
+            failedItems.forEach(LOGGER::error);
+            return null;
+        }
+    }
+
+    private JSONObject getStoriesItems() throws IOException {
+        return graphqlRequest(new JSONObject().append("reel_ids", idString).put("precomposed_overlay", false));
+    }
+
+    // Two requests with different query hashes required for pinned items.
+    // Query hash to be used depends on flag specified:
+    // pinnedRip flag is used initially to get list of pinned albums;
+    // pinnedReelRip flag is used next to get media urls.
+    private JSONObject getPinnedItems(Document document) throws IOException {
+        JSONObject queryForIds = new JSONObject().put("user_id", idString).put("include_highlight_reels", true);
+        JSONObject pinnedIdsJson = graphqlRequest(queryForIds);
+        JSONArray pinnedItems = getJsonArrayByPath(pinnedIdsJson, "data.user.edge_highlight_reels.edges");
+        pinnedRip = false;
+        pinnedReelRip = true;
+        qHash = getQhash(document);
+        JSONObject queryForDetails = new JSONObject();
+        getStreamOfJsonArray(pinnedItems)
+                .map(object -> getJsonStringByPath(object, "node.id"))
+                .forEach(id -> queryForDetails.append("highlight_reel_ids", id));
+        queryForDetails.put("precomposed_overlay", false);
+        return graphqlRequest(queryForDetails);
+    }
+
+    private JSONObject graphqlRequest(JSONObject vars) throws IOException {
+        // Sleep for a while to avoid a ban
+        sleep(2500);
+        String url = format("https://www.instagram.com/graphql/query/?query_hash=%s&variables=%s", qHash, vars.toString());
+        return Http.url(url).cookies(cookies).getJSON();
+    }
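+    // Editor's sketch of the request this builds (hash and variables are made up):
+    //   https://www.instagram.com/graphql/query/?query_hash=58b6785bea111c67129decbe6a448951&variables={"id":"123","first":12}
+    // The variables JSON should end up percent-encoded on the wire; the
+    // sleep(2500) above throttles every graphql call to reduce the ban risk.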
+
+    @Override
+    public List<String> getURLsFromJSON(JSONObject json) {
+        if (storiesRip || pinnedReelRip) {
+            JSONArray storyAlbums = getJsonArrayByPath(json, "data.reels_media");
+            return getStreamOfJsonArray(storyAlbums)
+                    .flatMap(album -> getStreamOfJsonArray(album.getJSONArray("items")))
+                    .peek(storyItem -> itemPrefixes.add(getTimestampPrefix(storyItem)))
+                    .flatMap(this::parseStoryItemForUrls)
+                    .collect(Collectors.toList());
+        }
+        if (postRip) {
+            JSONObject detailsJson = downloadItemDetailsJson(idString);
+            addPrefixInfo(detailsJson);
+            return parseItemDetailsForUrls(detailsJson).collect(Collectors.toList());
+        }
+        JSONArray edges = getMediaRoot(json).getJSONArray("edges");
+        return getStreamOfJsonArray(edges)
+                .map(edge -> getJsonStringByPath(edge, "node.shortcode"))
+                .map(this::downloadItemDetailsJson)
+                .filter(Objects::nonNull)
+                .peek(this::addPrefixInfo)
+                .flatMap(this::parseItemDetailsForUrls)
+                .collect(Collectors.toList());
+    }
+
+    private Stream<String> parseStoryItemForUrls(JSONObject storyItem) {
+        if (storyItem.getBoolean("is_video")) {
+            itemPrefixes.add(getTimestampPrefix(storyItem) + "preview_");
+            int lastIndex = storyItem.getJSONArray("video_resources").length() - 1;
+            return Stream.of(
+                    getJsonStringByPath(storyItem, "video_resources[" + lastIndex + "].src"),
+                    storyItem.getString("display_url"));
+        }
+        return Stream.of(storyItem.getString("display_url"));
+    }
+
+    private JSONObject getMediaRoot(JSONObject json) {
+        String userExtra = "data.user.edge_owner_to_timeline_media";
+        String igtvExtra = "data.user.edge_felix_video_timeline";
+        String taggedExtra = "data.user.edge_user_to_photos_of_you";
+        String hashtagExtra = "data.hashtag.edge_hashtag_to_media";
+        String userHomeRoot = "entry_data.ProfilePage[0].graphql.user.edge_owner_to_timeline_media";
+        String igtvHomeRoot = "entry_data.ProfilePage[0].graphql.user.edge_felix_video_timeline";
+        String hashtagHomeRoot = "entry_data.TagPage[0].graphql.hashtag.edge_hashtag_to_media";
+        String mediaRootPath = json.optJSONObject("entry_data") != null ?
+                (hashtagRip ? hashtagHomeRoot : igtvRip ? igtvHomeRoot : userHomeRoot) :
+                hashtagRip ? hashtagExtra : igtvRip ? igtvExtra : taggedRip ? taggedExtra : userExtra;
+        return getJsonObjectByPath(json, mediaRootPath);
+    }
+
+    private JSONObject downloadItemDetailsJson(String shortcode) {
+        String url = "https://www.instagram.com/p/%s/?__a=1";
         try {
-            Document doc = Http.url("https://www.instagram.com/p/" + videoID).get();
+            Http http = Http.url(format(url, shortcode));
+            http.ignoreContentType();
+            http.connection().followRedirects(false);
+            Connection.Response response = http.cookies(cookies).response();
+            // Fix for redirection link; repeat request with the new shortcode
+            if (response.statusCode() == 302) {
+                Pattern redirectIdPattern = Pattern.compile("/p/(?<shortcode>[^?/]+)");
+                Matcher m = redirectIdPattern.matcher(response.header("location"));
+                return m.find() ? downloadItemDetailsJson(m.group("shortcode")) : null;
+            }
+            return new JSONObject(response.body());
+        } catch (Exception e) {
+            failedItems.add(shortcode);
+            LOGGER.trace(format("No item %s found", shortcode), e);
+        }
+        return null;
+    }
+
+    private void addPrefixInfo(JSONObject itemDetailsJson) {
+        JSONObject mediaItem = getJsonObjectByPath(itemDetailsJson, "graphql.shortcode_media");
+        String shortcode = mediaItem.getString("shortcode");
+        int subItemsCount = "GraphSidecar".equals(mediaItem.getString("__typename")) ?
+                getJsonArrayByPath(mediaItem, "edge_sidecar_to_children.edges").length() : 1;
+        for (int i = 0; i < subItemsCount; i++) {
+            itemPrefixes.add(getTimestampPrefix(mediaItem) + shortcode + "_");
+        }
+    }
+
+    private String getTimestampPrefix(JSONObject item) {
+        Instant instant = Instant.ofEpochSecond(item.getLong("taken_at_timestamp"));
+        return DateTimeFormatter.ofPattern("yyyy-MM-dd_HH-mm-ss_").format(ZonedDateTime.ofInstant(instant, ZoneOffset.UTC));
+    }
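+    // Worked example (editor's addition): taken_at_timestamp = 1575225302 is
+    // 2019-12-01 18:35:02 UTC, so the prefix is "2019-12-01_18-35-02_" and a
+    // post with (hypothetical) shortcode "B5abcDEfgh" yields file prefixes like
+    // "2019-12-01_18-35-02_B5abcDEfgh_".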
- imageURL = imageURL.replaceAll("scontent.cdninstagram.com/hphotos-", "igcdn-photos-d-a.akamaihd.net/hphotos-ak-"); - - // Instagram returns cropped images to unauthenticated applications to maintain legacy support. - // To retrieve the uncropped image, remove this segment from the URL. - // Segment format: cX.Y.W.H - eg: c0.134.1080.1080 - imageURL = imageURL.replaceAll("/c\\d{1,4}\\.\\d{1,4}\\.\\d{1,4}\\.\\d{1,4}", ""); - imageURL = imageURL.replaceAll("\\?ig_cache_key.+$", ""); - return imageURL; - } - - public String getAfter(JSONObject json) { - try { - return json.getJSONObject("entry_data").getJSONArray("ProfilePage").getJSONObject(0) - .getJSONObject("graphql").getJSONObject("user") - .getJSONObject("edge_owner_to_timeline_media").getJSONObject("page_info").getString("end_cursor"); - } catch (JSONException e) { - // This is here so that when the user rips the last page they don't get a "end_cursor not a string" error - try { - return json.getJSONObject("data").getJSONObject("user") - .getJSONObject("edge_owner_to_timeline_media").getJSONObject("page_info").getString("end_cursor"); - } catch (JSONException t) { - return ""; - } - } - } - @Override - public List getURLsFromJSON(JSONObject json) { - List imageURLs = new ArrayList<>(); - if (!url.toExternalForm().contains("/p/")) { - nextPageID = getAfter(json); + protected void downloadURL(URL url, int index) { + if (Utils.getConfigBoolean("instagram.download_images_only", false) && url.toString().contains(".mp4?")) { + LOGGER.info("Skipped video url: " + url); + return; } - - // get the rhx_gis value so we can get the next page later on - if (rhx_gis == null) { - rhx_gis = json.getString("rhx_gis"); - } - if (!url.toExternalForm().contains("/p/")) { - JSONArray datas = new JSONArray(); - if (!rippingTag) { - // This first try only works on data from the first page - try { - JSONArray profilePage = json.getJSONObject("entry_data").getJSONArray("ProfilePage"); - userID = profilePage.getJSONObject(0).getString("logging_page_id").replaceAll("profilePage_", ""); - datas = json.getJSONObject("entry_data").getJSONArray("ProfilePage").getJSONObject(0) - .getJSONObject("graphql").getJSONObject("user") - .getJSONObject("edge_owner_to_timeline_media").getJSONArray("edges"); - } catch (JSONException e) { - datas = json.getJSONObject("data").getJSONObject("user") - .getJSONObject("edge_owner_to_timeline_media").getJSONArray("edges"); - } - } else { - try { - JSONArray tagPage = json.getJSONObject("entry_data").getJSONArray("TagPage"); - datas = tagPage.getJSONObject(0).getJSONObject("graphql").getJSONObject("hashtag") - .getJSONObject("edge_hashtag_to_media").getJSONArray("edges"); - } catch (JSONException e) { - datas = json.getJSONObject("data").getJSONObject("hashtag").getJSONObject("edge_hashtag_to_media") - .getJSONArray("edges"); - } - } - for (int i = 0; i < datas.length(); i++) { - JSONObject data = (JSONObject) datas.get(i); - data = data.getJSONObject("node"); - Long epoch = data.getLong("taken_at_timestamp"); - Instant instant = Instant.ofEpochSecond(epoch); - String image_date = DateTimeFormatter.ofPattern("yyyy_MM_dd_hh:mm_").format(ZonedDateTime.ofInstant(instant, ZoneOffset.UTC)); - // It looks like tag pages don't have the __typename key - if (!rippingTag) { - if (data.getString("__typename").equals("GraphSidecar")) { - try { - Document slideShowDoc = Http.url(new URL("https://www.instagram.com/p/" + data.getString("shortcode"))).get(); - List toAdd = getPostsFromSinglePage(getJSONFromPage(slideShowDoc)); - for (int slideShowInt 
-
-        // get the rhx_gis value so we can get the next page later on
-        if (rhx_gis == null) {
-            rhx_gis = json.getString("rhx_gis");
-        }
-        if (!url.toExternalForm().contains("/p/")) {
-            JSONArray datas = new JSONArray();
-            if (!rippingTag) {
-                // This first try only works on data from the first page
-                try {
-                    JSONArray profilePage = json.getJSONObject("entry_data").getJSONArray("ProfilePage");
-                    userID = profilePage.getJSONObject(0).getString("logging_page_id").replaceAll("profilePage_", "");
-                    datas = json.getJSONObject("entry_data").getJSONArray("ProfilePage").getJSONObject(0)
-                            .getJSONObject("graphql").getJSONObject("user")
-                            .getJSONObject("edge_owner_to_timeline_media").getJSONArray("edges");
-                } catch (JSONException e) {
-                    datas = json.getJSONObject("data").getJSONObject("user")
-                            .getJSONObject("edge_owner_to_timeline_media").getJSONArray("edges");
-                }
-            } else {
-                try {
-                    JSONArray tagPage = json.getJSONObject("entry_data").getJSONArray("TagPage");
-                    datas = tagPage.getJSONObject(0).getJSONObject("graphql").getJSONObject("hashtag")
-                            .getJSONObject("edge_hashtag_to_media").getJSONArray("edges");
-                } catch (JSONException e) {
-                    datas = json.getJSONObject("data").getJSONObject("hashtag").getJSONObject("edge_hashtag_to_media")
-                            .getJSONArray("edges");
-                }
-            }
-            for (int i = 0; i < datas.length(); i++) {
-                JSONObject data = (JSONObject) datas.get(i);
-                data = data.getJSONObject("node");
-                Long epoch = data.getLong("taken_at_timestamp");
-                Instant instant = Instant.ofEpochSecond(epoch);
-                String image_date = DateTimeFormatter.ofPattern("yyyy_MM_dd_hh:mm_").format(ZonedDateTime.ofInstant(instant, ZoneOffset.UTC));
-                // It looks like tag pages don't have the __typename key
-                if (!rippingTag) {
-                    if (data.getString("__typename").equals("GraphSidecar")) {
-                        try {
-                            Document slideShowDoc = Http.url(new URL("https://www.instagram.com/p/" + data.getString("shortcode"))).get();
-                            List<String> toAdd = getPostsFromSinglePage(getJSONFromPage(slideShowDoc));
-                            for (int slideShowInt = 0; slideShowInt < toAdd.size(); slideShowInt++) {
-                                addURLToDownload(new URL(toAdd.get(slideShowInt)), image_date + data.getString("shortcode"));
-                            }
-                        } catch (MalformedURLException e) {
-                            LOGGER.error("Unable to download slide show, URL was malformed");
-                        } catch (IOException e) {
-                            LOGGER.error("Unable to download slide show");
-                        }
-                    }
-                }
-                try {
-                    if (!data.getBoolean("is_video")) {
-                        if (imageURLs.isEmpty()) {
-                            // We add this one item to the array because either wise
-                            // the ripper will error out because we returned an empty array
-                            imageURLs.add(getOriginalUrl(data.getString("display_url")));
-                        }
-                        addURLToDownload(new URL(data.getString("display_url")), image_date);
-                    } else {
-                        if (!Utils.getConfigBoolean("instagram.download_images_only", false)) {
-                            addURLToDownload(new URL(getVideoFromPage(data.getString("shortcode"))), image_date);
-                        } else {
-                            sendUpdate(RipStatusMessage.STATUS.DOWNLOAD_WARN, "Skipping video " + data.getString("shortcode"));
-                        }
-                    }
-                } catch (MalformedURLException e) {
-                    LOGGER.info("Got MalformedURLException");
-                    return imageURLs;
-                }
-
-                if (isThisATest()) {
-                    break;
-                }
-            }
-
-        } else { // We're ripping from a single page
-            LOGGER.info("Ripping from single page");
-            imageURLs = getPostsFromSinglePage(json);
-        }
-
-        return imageURLs;
+        addURLToDownload(url, itemPrefixes.get(index - 1), "", null, cookies);
     }
 
-    private String getIGGis(String variables) {
-        String stringToMD5 = rhx_gis + ":" + variables;
-        LOGGER.debug("String to md5 is \"" + stringToMD5 + "\"");
-        try {
-            byte[] bytesOfMessage = stringToMD5.getBytes("UTF-8");
-
-            MessageDigest md = MessageDigest.getInstance("MD5");
-            byte[] hash = md.digest(bytesOfMessage);
-            StringBuffer sb = new StringBuffer();
-            for (int i = 0; i < hash.length; ++i) {
-                sb.append(Integer.toHexString((hash[i] & 0xFF) | 0x100).substring(1,3));
-            }
-            return sb.toString();
-        } catch(UnsupportedEncodingException e) {
-            return null;
-        } catch(NoSuchAlgorithmException e) {
-            return null;
-        }
-    }
+    // Javascript parsing
+    /* ------------------------------------------------------------------------------------------------------- */
+    private String getHashValue(String javaScriptData, String keyword, int offset) {
+        List<Statement> statements = getJsBodyBlock(javaScriptData).getStatements();
+        return statements.stream()
+                .flatMap(statement -> filterItems(statement, ExpressionStatement.class))
+                .map(ExpressionStatement::getExpression)
+                .flatMap(expression -> filterItems(expression, CallNode.class))
+                .map(CallNode::getArgs)
+                .map(expressions -> expressions.get(0))
+                .flatMap(expression -> filterItems(expression, FunctionNode.class))
+                .map(FunctionNode::getBody)
+                .map(Block::getStatements)
+                .map(statementList -> lookForHash(statementList, keyword, offset))
+                .filter(Objects::nonNull)
+                .findFirst().orElse(null);
+    }
 
-    @Override
-    public JSONObject getNextPage(JSONObject json) throws IOException {
-        JSONObject toreturn;
-        java.util.Map<String, String> cookies = new HashMap<String, String>();
-// This shouldn't be hardcoded and will break one day
-        cookies.put("ig_pr", "1");
-        cookies.put("csrftoken", csrftoken);
-        if (!nextPageID.equals("") && !isThisATest()) {
-            if (rippingTag) {
-                try {
-                    sleep(2500);
-                    String vars = "{\"tag_name\":\"" + tagName + "\",\"first\":4,\"after\":\"" + nextPageID + "\"}";
-                    String ig_gis = getIGGis(vars);
-                    toreturn = getPage("https://www.instagram.com/graphql/query/?query_hash=" + qHash +
-                            "&variables=" + vars, ig_gis);
-                    // Sleep for a while to avoid a ban
-                    LOGGER.info(toreturn);
-                    if (!pageHasImages(toreturn)) {
-                        throw new IOException("No more pages");
-                    }
-                    return toreturn;
-
-                } catch (IOException e) {
-                    throw new IOException("No more pages");
-                }
-
-            }
more pages"); - } - return toreturn; - - } catch (IOException e) { - throw new IOException("No more pages"); - } - - } - try { - // Sleep for a while to avoid a ban - sleep(2500); - String vars = "{\"id\":\"" + userID + "\",\"first\":12,\"after\":\"" + nextPageID + "\"}"; - String ig_gis = getIGGis(vars); - LOGGER.info(ig_gis); - - LOGGER.info("https://www.instagram.com/graphql/query/?query_hash=" + qHash + "&variables=" + vars); - toreturn = getPage("https://www.instagram.com/graphql/query/?query_hash=" + qHash + "&variables=" + vars, ig_gis); - if (!pageHasImages(toreturn)) { - throw new IOException("No more pages"); - } - return toreturn; - } catch (IOException e) { - return null; - } - } else { - throw new IOException("No more pages"); - } - } - - @Override - public void downloadURL(URL url, int index) { - addURLToDownload(url); - } - - private boolean pageHasImages(JSONObject json) { - LOGGER.info(json); - int numberOfImages = json.getJSONObject("data").getJSONObject("user") - .getJSONObject("edge_owner_to_timeline_media").getJSONArray("edges").length(); - if (numberOfImages == 0) { - return false; - } - return true; - } - - private JSONObject getPage(String url, String ig_gis) { - StringBuilder sb = new StringBuilder(); - try { - // We can't use Jsoup here because it won't download a non-html file larger than a MB - // even if you set maxBodySize to 0 - URLConnection connection = new URL(url).openConnection(); - connection.setRequestProperty("User-Agent", USER_AGENT); - connection.setRequestProperty("x-instagram-gis", ig_gis); - BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream())); - String line; - while ((line = in.readLine()) != null) { - sb.append(line); - - } - in.close(); - workAroundJsonString = sb.toString(); - return new JSONObject(sb.toString()); - - } catch (MalformedURLException e) { - LOGGER.info("Unable to get query_hash, " + url + " is a malformed URL"); - return null; - } catch (IOException e) { - LOGGER.info("Unable to get query_hash"); - LOGGER.info(e.getMessage()); - return null; - } - } - - private String getQHash(Document doc) { - String jsFileURL = "https://www.instagram.com" + doc.select("link[rel=preload]").attr("href"); - StringBuilder sb = new StringBuilder(); - Document jsPage; - try { - // We can't use Jsoup here because it won't download a non-html file larger than a MB - // even if you set maxBodySize to 0 - URLConnection connection = new URL(jsFileURL).openConnection(); - BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream())); - String line; - while ((line = in.readLine()) != null) { - sb.append(line); - } - in.close(); - - } catch (MalformedURLException e) { - LOGGER.info("Unable to get query_hash, " + jsFileURL + " is a malformed URL"); - return null; - } catch (IOException e) { - LOGGER.info("Unable to get query_hash"); - LOGGER.info(e.getMessage()); - return null; - } - if (!rippingTag) { - Pattern jsP = Pattern.compile("byUserId\\.get\\(t\\)\\)\\|\\|void 0===r\\?void 0:r\\.pagination},queryId:.([a-zA-Z0-9]+)"); - Matcher m = jsP.matcher(sb.toString()); - if (m.find()) { - return m.group(1); - } - - } else { - Pattern jsP = Pattern.compile("return e.tagMedia.byTagName.get\\(t\\).pagination},queryId:.([a-zA-Z0-9]+)."); - Matcher m = jsP.matcher(sb.toString()); - if (m.find()) { - return m.group(1); + private String lookForHash(List list, String keyword, int offset) { + for (int i = 0; i < list.size(); i++) { + Statement st = list.get(i); + if (st.toString().contains(keyword)) { 
+
+    private <T> Stream<T> filterItems(Object obj, Class<T> aClass) {
+        return Stream.of(obj).filter(aClass::isInstance).map(aClass::cast);
+    }
+
+    private Block getJsBodyBlock(String javaScriptData) {
+        ErrorManager errors = new ErrorManager();
+        Context context = new Context(new Options("nashorn"), errors, Thread.currentThread().getContextClassLoader());
+        return new Parser(context.getEnv(), Source.sourceFor("name", javaScriptData), errors).parse().getBody();
+    }
+
+    // Some JSON helper methods below
+    /* ------------------------------------------------------------------------------------------------------- */
+    private JSONObject getJsonObjectByPath(JSONObject object, String key) {
+        Pattern arrayPattern = Pattern.compile("(?<arr>.*)\\[(?<idx>\\d+)]");
+        JSONObject result = object;
+        for (String s : key.split("[.]")) {
+            Matcher m = arrayPattern.matcher(s);
+            result = m.matches() ?
+                    result.getJSONArray(m.group("arr")).getJSONObject(Integer.parseInt(m.group("idx"))) :
+                    result.getJSONObject(s);
+        }
+        return result;
+    }
+
+    private <T> T getByPath(BiFunction<JSONObject, String, T> func, JSONObject object, String key) {
+        int namePos = key.lastIndexOf('.');
+        JSONObject parent = namePos < 0 ? object : getJsonObjectByPath(object, key.substring(0, namePos));
+        return func.apply(parent, key.substring(namePos + 1));
+    }
+
+    private JSONArray getJsonArrayByPath(JSONObject object, String key) {
+        return getByPath(JSONObject::getJSONArray, object, key);
+    }
+
+    private String getJsonStringByPath(JSONObject object, String key) {
+        return getByPath(JSONObject::getString, object, key);
+    }
+
+    private Stream<JSONObject> getStreamOfJsonArray(JSONArray array) {
+        return StreamSupport.stream(new JSONSpliterator(array), false);
+    }
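+    // Usage sketch (editor's addition, JSON abbreviated/hypothetical): given
+    //   {"data":{"user":{"edge_highlight_reels":{"edges":[{"node":{"id":"42"}}]}}}}
+    // getJsonArrayByPath(json, "data.user.edge_highlight_reels.edges") returns the
+    // edges array, getJsonStringByPath(edge, "node.id") reads "42", and an
+    // "edges[0]"-style segment is indexed via the arrayPattern in getJsonObjectByPath.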
+
+    private class JSONSpliterator extends Spliterators.AbstractSpliterator<JSONObject> {
+        private JSONArray array;
+        private int index = 0;
+
+        JSONSpliterator(JSONArray array) {
+            super(array.length(), SIZED | ORDERED);
+            this.array = array;
+        }
+
+        @Override
+        public boolean tryAdvance(Consumer<? super JSONObject> action) {
+            if (index == array.length()) {
+                return false;
+            }
+            action.accept(array.getJSONObject(index++));
+            return true;
+        }
+    }
 }
diff --git a/src/main/java/com/rarchives/ripme/utils/Utils.java b/src/main/java/com/rarchives/ripme/utils/Utils.java
index 1f49b8a5..338c8f27 100644
--- a/src/main/java/com/rarchives/ripme/utils/Utils.java
+++ b/src/main/java/com/rarchives/ripme/utils/Utils.java
@@ -12,6 +12,7 @@ import javax.sound.sampled.Clip;
 import javax.sound.sampled.Line;
 import javax.sound.sampled.LineEvent;
 import java.io.File;
+import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.UnsupportedEncodingException;
@@ -179,21 +180,21 @@ public class Utils {
     /**
      * Determines if your current system is a Windows system.
      */
-    private static boolean isWindows() {
+    public static boolean isWindows() {
         return OS.contains("win");
     }
 
     /**
      * Determines if your current system is a Mac system
      */
-    private static boolean isMacOS() {
+    public static boolean isMacOS() {
         return OS.contains("mac");
     }
 
     /**
      * Determines if current system is based on UNIX
      */
-    private static boolean isUnix() {
+    public static boolean isUnix() {
         return OS.contains("nix") || OS.contains("nux") || OS.contains("bsd");
     }
 
@@ -773,4 +774,34 @@
         return false;
     }
 
+    public static File shortenSaveAsWindows(String ripsDirPath, String fileName) throws FileNotFoundException {
+        // int ripDirLength = ripsDirPath.length();
+        // int maxFileNameLength = 260 - ripDirLength;
+        // LOGGER.info(maxFileNameLength);
+        LOGGER.error("The filename " + fileName + " is too long to be saved on this file system.");
+        LOGGER.info("Shortening filename");
+        String fullPath = ripsDirPath + File.separator + fileName;
+        // How long the path without the file name is
+        int pathLength = ripsDirPath.length();
+        int fileNameLength = fileName.length();
+        if (pathLength == 260) {
+            // We've reached the max length, there's nothing more we can do
+            throw new FileNotFoundException("File path is too long for this OS");
+        }
+        String[] saveAsSplit = fileName.split("\\.");
+        // Get the file extension so when we shorten the file name we don't cut off the
+        // file extension
+        String fileExt = saveAsSplit[saveAsSplit.length - 1];
+        // The max limit for paths on Windows is 260 chars
+        LOGGER.info(fullPath.substring(0, 259 - pathLength - fileExt.length() + 1) + "." + fileExt);
+        fullPath = fullPath.substring(0, 259 - pathLength - fileExt.length() + 1) + "." + fileExt;
+        LOGGER.info(fullPath);
+        LOGGER.info(fullPath.length());
+        return new File(fullPath);
+    }
+
+    public static String sanitizeSaveAs(String fileNameToSan) {
+        return fileNameToSan.replaceAll("[\\\\/:*?\"<>|]", "_");
+    }
+
 }
\ No newline at end of file
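// Editor's usage sketch for the new Utils helpers (values hypothetical):
//   Utils.sanitizeSaveAs("photo: *final*?.jpg")  -> "photo_ _final__.jpg"
//   Utils.shortenSaveAsWindows(ripsDir, longName) -> a File whose full path is
//   trimmed so it stays under Windows' 260-character MAX_PATH limit, with the
//   "." + extension re-appended after the cut.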