Skip to content

Instantly share code, notes, and snippets.

@JustSimplyKyle
Last active September 18, 2024 11:04
Show Gist options
  • Save JustSimplyKyle/5ad650d87c2e82a6ba3ece2fbf440f05 to your computer and use it in GitHub Desktop.
Save JustSimplyKyle/5ad650d87c2e82a6ba3ece2fbf440f05 to your computer and use it in GitHub Desktop.
A scraper for `manhuagui.com` that can also convert the downloaded chapters to EPUB
use std::io::Write;
use std::process::Command;
use std::time::Duration;
use argh::FromArgs;
use bytes::BytesMut;
use color_eyre::eyre::{bail, ContextCompat};
use fantoccini::{ClientBuilder, Locator};
use reqwest::header::{
HeaderMap, HeaderValue, ACCEPT, ACCEPT_ENCODING, ACCEPT_LANGUAGE, CONNECTION, REFERER,
USER_AGENT,
};
use reqwest::Client;
use tokio::fs::create_dir_all;
use tokio_stream::StreamExt;
#[derive(FromArgs, Clone)]
// Command-line options, parsed in `main` via `argh::from_env()`.
// The `///` comments below double as the `--help` text generated by argh,
// so edit them with the user-facing wording in mind.
// Example chapter URL: "https://www.manhuagui.com/comic/33860/725114.html";
/// Web scraper to download manhuagui things
struct Arguments {
// Directory handed straight to `to_epub`; when set, `main` skips the download step.
/// if you don't want to download, only wanting to translate already downloaded files
#[argh(option)]
only_translate_to_epub: Option<String>,
// Page the browser is pointed at first; `main` bails out if this is missing
// and `only_translate_to_epub` is not set.
/// url
#[argh(option, short = 'i')]
url: Option<String>,
// Adds `--headless` both to the chromedriver command line and to the Chrome
// arguments sent in the session capabilities.
/// to launch the chrome webdriver headlessly or not
#[argh(switch)]
headless: bool,
// Passed to chromedriver as `--port=<port>` and used to build the
// `http://localhost:<port>` WebDriver address.
/// port
#[argh(option)]
port: Option<i64>,
}
/// Program entry point.
///
/// Two modes are supported:
/// * `--only-translate-to-epub <dir>`: convert an already downloaded directory
///   to EPUB files and exit.
/// * `--url <url>`: start chromedriver, download every chapter reachable from
///   that page, then convert the downloaded directory to EPUB files.
///
/// # Errors
/// Fails when neither option is given, or when spawning chromedriver,
/// downloading, or converting fails.
#[tokio::main]
async fn main() -> color_eyre::Result<()> {
    color_eyre::install()?;
    let args: Arguments = argh::from_env();

    // Conversion never talks to a browser, so chromedriver is not started in
    // this mode (it would otherwise be left running for nothing).
    if let Some(dir) = args.only_translate_to_epub.as_deref() {
        return to_epub(dir).await;
    }

    // Validate the input before spawning any external process.
    let url = match args.url.as_deref() {
        Some(url) => url,
        None => bail!("You HAVE to provide url"),
    };

    spawn_chromedriver(&args)?;
    let anime = download(&args, url).await?;
    to_epub(&anime).await
}
/// Starts `chromedriver` (looked up on `PATH`) as a background child process.
///
/// `--headless` is passed when `args.headless` is set and `--port=<port>` when
/// `args.port` is set; unset options add no argument at all (previously an
/// empty-string argument was passed in their place).
///
/// # Errors
/// Returns the I/O error from spawning, e.g. when `chromedriver` is not installed.
fn spawn_chromedriver(args: &Arguments) -> color_eyre::Result<()> {
    let mut command = Command::new("chromedriver");
    if args.headless {
        command.arg("--headless");
    }
    if let Some(port) = args.port {
        command.arg(format!("--port={port}"));
    }
    // The child handle is intentionally dropped: dropping a `std::process::Child`
    // neither waits for nor kills the process, so chromedriver keeps running.
    command.spawn()?;
    Ok(())
}
async fn to_epub(anime: &str) -> color_eyre::Result<()> {
let mut dir = tokio::fs::read_dir(&anime).await?;
while let Some(entry) = dir.next_entry().await? {
let chapter = entry
.path()
.to_string_lossy()
.to_string()
.chars()
.filter(|x| x.is_numeric())
.collect::<String>()
.parse::<i64>()?;
let mut sub_path = tokio::fs::read_dir(entry.path()).await?;
let mut paths = Vec::new();
while let Some(path) = sub_path.next_entry().await? {
if path.path().extension() == Some("jpg".as_ref()) {
paths.push(path.path());
}
}
paths.sort_by_key(|x| {
x.file_name()
.unwrap()
.to_string_lossy()
.to_string()
.chars()
.filter(|x| x.is_numeric())
.collect::<String>()
.parse::<i64>()
.unwrap()
});
let mut pandoc = pandoc::new();
let html_path = entry.path().join("html_pages");
create_dir_all(&html_path).await?;
for (str, path) in paths.iter().map(|x| {
let name = x
.file_name()
.unwrap_or_default()
.to_string_lossy()
.to_string();
(
format!(
"<img src='{}' style='object-fit: contain; height: 100%; width: auto'>",
x.display(),
),
html_path.join(name),
)
}) {
dbg!(&path);
let with_html_extension = path.with_extension("html");
let mut file = std::fs::File::create(&with_html_extension)?;
pandoc.add_input(&with_html_extension);
write!(&mut file, "{str}")?;
}
let first_picture = paths[0].clone();
pandoc.add_option(pandoc::PandocOption::EpubCoverImage(first_picture));
pandoc.set_output(pandoc::OutputKind::File(
format!("{anime} 第 {chapter} 卷.epub").into(),
));
pandoc.execute()?;
}
Ok(())
}
/// Drives Chrome through chromedriver to download every page of every chapter,
/// starting at `url`, into `"{title}/Chapter {n}/{page}.jpg"`.
///
/// The browser is only used to discover image URLs and to click through the
/// reader; the images themselves are fetched concurrently by spawned tasks
/// using the `reqwest` client from `build_client`. Chapters are numbered from 1
/// in the order they are visited, and the loop stops once an element with class
/// `tip-alert` is present on the page.
///
/// Returns the title text read from `.title h1 a`, which is also the name of
/// the directory the chapters were written to.
///
/// # Errors
/// Fails when the WebDriver session cannot be created, an expected element is
/// missing, or any image download or file write fails.
async fn download(args: &Arguments, url: &str) -> color_eyre::Result<String> {
    // chromedriver listens on 9515 unless started with `--port`, and
    // `spawn_chromedriver` only passes `--port` when the user supplied one.
    const DEFAULT_CHROMEDRIVER_PORT: i64 = 9515;

    let mut chrome_args = vec!["--disable-gpu", "--no-sandbox", "--disable-dev-shm-usage"];
    if args.headless {
        chrome_args.insert(0, "--headless");
    }
    let mut caps = serde_json::map::Map::new();
    caps.insert(
        "goog:chromeOptions".to_string(),
        serde_json::json!({ "args": chrome_args }),
    );

    let port = args.port.unwrap_or(DEFAULT_CHROMEDRIVER_PORT);
    let driver = ClientBuilder::native()
        .capabilities(caps)
        .connect(&format!("http://localhost:{port}"))
        .await?;
    driver.goto(url).await?;
    let client = build_client().await?;

    let anime = driver
        .find(Locator::Css(".title"))
        .await?
        .find(Locator::Css("h1"))
        .await?
        .find(Locator::Css("a"))
        .await?
        .text()
        .await?;
    tokio::fs::create_dir_all(&anime).await?;

    let mut tasks = Vec::new();
    for chpt in 1.. {
        // Presence of `.tip-alert` is the stop condition for the chapter loop.
        if driver.find(Locator::Css(".tip-alert")).await.is_ok() {
            break;
        }
        tokio::fs::create_dir_all(format!("{anime}/Chapter {chpt}")).await?;
        let page_count = get_page_count(&driver).await?;
        for i in 1..=page_count {
            let ele = driver.find(Locator::Id("mangaFile")).await?;
            let image_url = ele.attr("src").await?.context("impossible")?;
            let client_cloned = client.clone();
            let anime_cloned = anime.clone();
            // Download in the background so the browser can move on to the next page.
            let task = tokio::spawn(async move {
                let bytes = access_image(&client_cloned, &image_url).await?;
                tokio::fs::write(format!("{anime_cloned}/Chapter {chpt}/{i}.jpg"), bytes).await?;
                Ok::<(), color_eyre::Report>(())
            });
            tasks.push(task);
            let next = driver.find(Locator::Id("next")).await?;
            next.click().await?;
        }
        // Click `.pb-x` if it is present before moving on to the next chapter.
        if let Ok(x) = driver.find(Locator::Css(".pb-x")).await {
            x.click().await?;
        }
        let chapter_btn = driver.find(Locator::Css(".nextC")).await?;
        chapter_btn.click().await?;
        // Poll until `#smh-msg-box` is gone before reading the next chapter.
        while driver.find(Locator::Id("smh-msg-box")).await.is_ok() {
            tokio::time::sleep(Duration::from_millis(200)).await;
        }
    }

    for task in tasks {
        task.await??;
    }

    // End the WebDriver session so the browser does not stay open. Everything
    // is already on disk at this point, so a failure here is only reported.
    if let Err(err) = driver.close().await {
        eprintln!("Failed to close the WebDriver session: {err}");
    }
    Ok(anime)
}
/// Reads the number of pages of the chapter currently open in `driver`.
///
/// Looks at the `span` elements inside `.title`, takes the first one whose text
/// contains `(`, and parses the number after the last `/` in it (for a text
/// such as `(3/20)` the result is 20).
///
/// # Errors
/// Fails when the elements cannot be found, no span contains `(`, or the part
/// after the last `/` does not hold a number. These cases previously panicked.
async fn get_page_count(driver: &fantoccini::Client) -> color_eyre::Result<i64> {
    let title = driver.find(Locator::Css(".title")).await?;
    let spans = title.find_all(Locator::Css("span")).await?;
    let counters = tokio_stream::iter(spans)
        .then(|span| async move { span.text().await })
        .filter_map(|text| text.ok())
        .filter(|text| text.contains('('));
    let counter = Box::pin(counters)
        .next()
        .await
        .context("no page counter found inside the `.title` element")?;

    // `rsplit` always yields at least one piece, so the fallback is never used.
    let tail = counter.rsplit('/').next().unwrap_or(&counter);
    // Drop the closing parenthesis (and anything else trailing the digits).
    let total = tail.trim_end_matches(|c: char| !c.is_ascii_digit());
    total
        .parse::<i64>()
        .ok()
        .with_context(|| format!("cannot read a page count from `{counter}`"))
}
/// Builds the `reqwest` client used for image downloads.
///
/// Every request made with the returned client carries a fixed set of default
/// headers (user agent, referer, accept headers, keep-alive and `Sec-Fetch-*`).
/// Before returning, one GET request is sent to the referrer page; its response
/// is discarded.
///
/// # Errors
/// Fails when the referer value is not a valid header value, the client cannot
/// be built, or the request to the referrer page fails.
async fn build_client() -> color_eyre::Result<Client> {
    let referrer_url = "https://www.manhuagui.com/comic/33860/459824.html";

    // Headers that have a typed name constant, in insertion order.
    let standard = [
        (
            USER_AGENT,
            HeaderValue::from_static("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"),
        ),
        (REFERER, HeaderValue::from_str(referrer_url)?),
        (
            ACCEPT,
            HeaderValue::from_static("image/webp,image/apng,image/*,*/*;q=0.8"),
        ),
        (
            ACCEPT_ENCODING,
            HeaderValue::from_static("gzip, deflate, br"),
        ),
        (ACCEPT_LANGUAGE, HeaderValue::from_static("en-US,en;q=0.9")),
        (CONNECTION, HeaderValue::from_static("keep-alive")),
    ];
    // Fetch-metadata headers, addressed by their literal names.
    let fetch_metadata = [
        ("Sec-Fetch-Dest", "image"),
        ("Sec-Fetch-Mode", "no-cors"),
        ("Sec-Fetch-Site", "cross-site"),
    ];

    let mut headers = HeaderMap::new();
    for (name, value) in standard {
        headers.insert(name, value);
    }
    for (name, value) in fetch_metadata {
        headers.insert(name, HeaderValue::from_static(value));
    }

    let client = reqwest::Client::builder()
        .default_headers(headers)
        .build()?;
    // One request to the referrer page; only its success matters.
    client.get(referrer_url).send().await?;
    Ok(client)
}
/// Downloads `image_url` with `client` and returns the complete response body.
///
/// The body is read chunk by chunk; after each chunk either the cumulative
/// progress in percent (when the server announced a non-zero content length)
/// or the number of bytes received so far is printed.
///
/// # Errors
/// Fails when the request cannot be sent, the server answers with a 4xx/5xx
/// status (so an error page is never saved as an image), or reading the body fails.
async fn access_image(client: &Client, image_url: &str) -> color_eyre::Result<bytes::Bytes> {
    let mut response = client.get(image_url).send().await?.error_for_status()?;
    println!(
        "Successfully accessed the image. Status code: {}",
        response.status()
    );

    // A missing or zero length cannot be used as a divisor for the percentage.
    let expected = response.content_length().filter(|&len| len > 0);
    let mut bytes = BytesMut::new();
    while let Some(chunk) = response.chunk().await? {
        bytes.extend_from_slice(&chunk);
        match expected {
            Some(total) => println!(
                "Percentage: {:.1}",
                bytes.len() as f64 / total as f64 * 100.0
            ),
            None => println!("Downloaded Bytes: {}", bytes.len()),
        }
    }
    Ok(bytes.freeze())
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment