3 unstable releases

0.2.0	Nov 8, 2022
0.1.1	Mar 26, 2021
0.1.0	Mar 23, 2021

#24 in #xpath

Used in scraper-main

MIT license

10KB
196 lines

XPATH Scraper

Makes it easier to scrape websites with XPath. It currently uses my own XPath parser, which is incomplete, undocumented, and was originally written to teach myself about parsing.

A very simple example is shown below; it is also available in the example folder:

use std::io::Cursor;

use scraper_macros::Scraper;
use scraper_main::{
	xpather,
	ConvertFromValue,
	ScraperMain
};

// Top-level scrape target: a tuple struct wrapping the list of posts found on the page.
// `Scraper` is derived for it, and `main` below calls `RedditList::scrape` on the parsed document.
#[derive(Debug, Scraper)]
pub struct RedditList(
	// Uses XPath to find the item containers: every `div`, anywhere in the document,
	// whose `class` contains "Post" but does not contain "promotedlink".
	// NOTE(review): presumably each matched node is scraped into one `RedditListItem` — confirm
	// against the derive macro's documentation.
	#[scrape(xpath = r#"//div[contains(@class, "Post") and not(contains(@class, "promotedlink"))]"#)]
	Vec<RedditListItem>
);


// One scraped post. Every field carries its own XPath and is an `Option<String>`.
// All paths start with `.`, i.e. they are relative to a context node.
// NOTE(review): presumably the context node is the container matched by `RedditList`, and a
// field is `None` when its path matches nothing — confirm against the derive macro.
#[derive(Debug, Scraper)]
pub struct RedditListItem {
	// URL of the post: the `href` attribute of the descendant link marked
	// `data-click-id="body"`.
	#[scrape(xpath = r#".//a[@data-click-id="body"]/@href"#)]
	pub url: Option<String>,

	// Title of the post: text of the `h3` nested in a `div` directly under that same
	// `data-click-id="body"` link.
	#[scrape(xpath = r#".//a[@data-click-id="body"]/div/h3/text()"#)]
	pub title: Option<String>,

	// When it was posted: text of the link marked `data-click-id="timestamp"`.
	// Kept as the raw string; no date parsing is done here.
	#[scrape(xpath = r#".//a[@data-click-id="timestamp"]/text()"#)]
	pub timestamp: Option<String>,

	// Amount of comments: text of the `span` directly under the link marked
	// `data-click-id="comments"`. Kept as a string, not parsed into a number.
	#[scrape(xpath = r#".//a[@data-click-id="comments"]/span/text()"#)]
	pub comment_count: Option<String>,

	// Vote count: text found by a purely positional path (first child `div`, then two
	// nested `div`s), so it depends on the exact page layout. Kept as a string.
	#[scrape(xpath = r#"./div[1]/div/div/text()"#)]
	pub votes: Option<String>,
}


#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
	// Fetch the subreddit page and read the whole response body as text.
	// A failure in either step is propagated to the caller via `?`.
	let html = reqwest::get("https://www.reddit.com/r/nocontextpics/")
		.await?
		.text()
		.await?;

	// Wrap the body in an in-memory reader and parse it into a document.
	let mut reader = Cursor::new(html);
	let document = xpather::parse_doc(&mut reader);

	// Run the derived scraper for `RedditList` against the parsed document.
	let posts = RedditList::scrape(&document, None)?;

	// Pretty-print the scraped result.
	println!("{:#?}", posts);

	Ok(())
}

Dependencies

~1.5MB
~38K SLoC