# Frangipani
The goal of this project is to create a configurable and extensible crawler framework.
## Features
- Continuous crawling
- Concurrent crawling
- Obey robots.txt (see the sketch after this list)
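The engine honors each site's robots.txt on your behalf. As a rough illustration of what such a check involves, here is a hand-rolled sketch; this is not part of Frangipani's API, and real parsers also handle `User-agent:` groups, `Allow:` rules, and wildcards:

```rust
/// Illustration only: returns true if `path` is permitted, considering
/// only the `Disallow:` lines of a robots.txt body.
fn is_allowed(robots_txt: &str, path: &str) -> bool {
    robots_txt
        .lines()
        .filter_map(|line| line.trim().strip_prefix("Disallow:"))
        .map(str::trim)
        .filter(|rule| !rule.is_empty())
        .all(|rule| !path.starts_with(rule))
}

fn main() {
    let robots = "User-agent: *\nDisallow: /admin/\nDisallow: /tmp/";
    assert!(is_allowed(robots, "/blog/post-1"));
    assert!(!is_allowed(robots, "/admin/login"));
}
```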
## Usage
```rust
use async_trait::async_trait;
use frangipani::{Response, Spider};
use frangipani::util::join_url;
use scraper::{Html, Selector};

pub struct DexcodeSpider {}

#[async_trait]
impl Spider for DexcodeSpider {
    fn name(&self) -> String {
        "dexcode-spider".to_owned()
    }

    fn start_urls(&self) -> Vec<String> {
        vec!["https://dexcode.com/".to_owned()]
    }

    async fn parse(&self, response: Response) -> (u64, Vec<String>) {
        // Nothing to scrape and no links to follow in non-HTML responses.
        if response.content_type() != "text/html" {
            return (0, vec![]);
        }
        let url = response.get_url().to_owned();
        let text = response.into_string().unwrap();
        let mut urls = vec![];
        // Keep the `Html` document (which is not `Send`) inside an inner
        // scope so it is dropped before the async fn could yield.
        {
            let document = Html::parse_document(&text);
            let link_selector = Selector::parse("a").unwrap();
            for link in document.select(&link_selector) {
                if let Some(relative_url) = link.value().attr("href") {
                    let joined_url = join_url(&url, relative_url);
                    let req_url = match reqwest::Url::parse(&joined_url) {
                        Ok(parsed) => parsed,
                        Err(_) => continue,
                    };
                    if req_url.scheme() != "http" && req_url.scheme() != "https" {
                        continue;
                    }
                    // Only push URLs on the `dexcode.com` domain; `domain()`
                    // is `None` for IP-address hosts, which we skip.
                    if req_url
                        .domain()
                        .map_or(false, |domain| domain.ends_with("dexcode.com"))
                    {
                        urls.push(req_url.to_string());
                    }
                }
            }
            let title_selector = Selector::parse("title").unwrap();
            let title = match document.select(&title_selector).next() {
                Some(el) => el.inner_html(),
                None => "".to_owned(),
            };
            println!("{},{}", url, title);
        }
        // One item scraped (the CSV line above), plus the URLs to crawl next.
        (1, urls)
    }
}

#[tokio::main]
async fn main() {
    env_logger::init();
    let spiders: Vec<Box<dyn Spider + Send + Sync>> = vec![Box::new(DexcodeSpider {})];
    let mut engine = frangipani::engine(spiders);
    engine.start().await;
}
```
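Here `join_url` resolves a relative `href` against the page's own URL before the link is filtered and enqueued. As the return values suggest, `parse` yields the number of items scraped from the page (the CSV line printed above) together with the list of URLs for the engine to crawl next. Because the example initializes `env_logger`, running it with `RUST_LOG=info cargo run` will show the engine's log output.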
For continuous crawling, see examples/continuous.rs in the project repository; a rough sketch of the pattern is shown below.
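This is a minimal sketch, not a copy of that example: it assumes `engine.start()` returns once the crawl frontier is empty, and simply re-runs the crawl on an interval, reusing the `DexcodeSpider` defined above.

```rust
use std::time::Duration;

#[tokio::main]
async fn main() {
    env_logger::init();
    let spiders: Vec<Box<dyn frangipani::Spider + Send + Sync>> =
        vec![Box::new(DexcodeSpider {})];
    let mut engine = frangipani::engine(spiders);
    loop {
        // Assumption: `start()` drains the crawl frontier and returns.
        engine.start().await;
        // Pause before re-crawling the start URLs.
        tokio::time::sleep(Duration::from_secs(3600)).await;
    }
}
```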