3 unstable releases
0.2.0 | Nov 8, 2022 |
---|---|
0.1.1 | Mar 26, 2021 |
0.1.0 | Mar 23, 2021 |
#22 in #xpath
Used in scraper-main
10KB
196 lines
XPATH Scraper
Makes it easier to scrape websites with XPath. It currently uses my own XPath parser, which is incomplete, undocumented, and was originally written to teach myself about parsing.
A very simple example is shown below; it is also available in the example folder:
use std::io::Cursor;
use scraper_macros::Scraper;
use scraper_main::{
xpather,
ConvertFromValue,
ScraperMain
};
// Top-level scrape target: a tuple struct wrapping a `Vec<RedditListItem>`.
// NOTE(review): presumably the `Scraper` derive builds one `RedditListItem`
// per node matched by the XPath on the field — the macro is not visible
// here, so confirm against `scraper_macros`.
#[derive(Debug, Scraper)]
pub struct RedditList(
// Uses XPath to find the item containers: every `div` in the document whose
// `class` attribute contains "Post" and does not contain "promotedlink".
#[scrape(xpath = r#"//div[contains(@class, "Post") and not(contains(@class, "promotedlink"))]"#)]
Vec<RedditListItem>
);
// A single post entry. Every XPath below starts with `.`, i.e. it is
// relative to a context node rather than the document root.
// NOTE(review): presumably the context node is one item container matched by
// `RedditList`, and a field is `None` when its XPath matches nothing —
// confirm against the `Scraper` derive.
#[derive(Debug, Scraper)]
pub struct RedditListItem {
// URL of the post: the `href` attribute of the descendant `a` element whose
// `data-click-id` is "body".
#[scrape(xpath = r#".//a[@data-click-id="body"]/@href"#)]
pub url: Option<String>,
// Title of the post: text of the `h3` under `a[@data-click-id="body"]/div`.
#[scrape(xpath = r#".//a[@data-click-id="body"]/div/h3/text()"#)]
pub title: Option<String>,
// When it was posted: text of the `a` element whose `data-click-id` is
// "timestamp". Kept as the raw string; no date parsing happens here.
#[scrape(xpath = r#".//a[@data-click-id="timestamp"]/text()"#)]
pub timestamp: Option<String>,
// Amount of comments: text of the `span` directly under the `a` element
// whose `data-click-id` is "comments". Kept as a string, not a number.
#[scrape(xpath = r#".//a[@data-click-id="comments"]/span/text()"#)]
pub comment_count: Option<String>,
// Vote count. This path is purely positional (first child `div`, then two
// nested `div`s), so it depends on the exact nesting of the page markup.
#[scrape(xpath = r#"./div[1]/div/div/text()"#)]
pub votes: Option<String>,
}
/// Downloads the subreddit front page, scrapes it into a [`RedditList`] and
/// pretty-prints the result.
///
/// Any request, body-read or scrape error is propagated to the caller via `?`.
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Fetch the subreddit page and read the whole response body as text.
    let html = reqwest::get("https://www.reddit.com/r/nocontextpics/")
        .await?
        .text()
        .await?;

    // The parser takes a mutable reader, so wrap the body in a cursor.
    let mut reader = Cursor::new(html);
    let document = xpather::parse_doc(&mut reader);

    // Run the derived scraper against the whole document.
    let posts = RedditList::scrape(&document, None)?;

    // Dump the scraped posts in pretty debug form.
    println!("{:#?}", posts);

    Ok(())
}
Dependencies
~1.5MB
~38K SLoC