Frangipani

Frangipani is a configurable and extensible web crawler framework for Rust.

Features

  • Continuous crawling
  • Concurrent crawling
  • robots.txt compliance (a conceptual sketch follows this list)
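
To make the robots.txt feature concrete, the sketch below shows the core rule a compliant crawler enforces: collect the Disallow prefixes for the wildcard user-agent and refuse to fetch any path that matches one. This is a deliberately simplified, self-contained illustration, not Frangipani's internal implementation (full parsers also handle per-agent groups, Allow rules, and Crawl-delay):

/// Decide whether `path` may be fetched under a (simplified) robots.txt.
/// Only `User-agent: *` groups and `Disallow:` prefix rules are handled.
fn is_allowed(robots_txt: &str, path: &str) -> bool {
    let mut applies = false;
    for line in robots_txt.lines() {
        let line = line.trim();
        if let Some(agent) = line.strip_prefix("User-agent:") {
            // Enter or leave the wildcard user-agent group.
            applies = agent.trim() == "*";
        } else if applies {
            if let Some(rule) = line.strip_prefix("Disallow:") {
                let prefix = rule.trim();
                // An empty Disallow line means "allow everything".
                if !prefix.is_empty() && path.starts_with(prefix) {
                    return false;
                }
            }
        }
    }
    true
}

fn main() {
    let robots = "User-agent: *\nDisallow: /private/";
    assert!(is_allowed(robots, "/blog/post"));
    assert!(!is_allowed(robots, "/private/data"));
}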

Usage

use async_trait::async_trait;
use frangipani::{Response, Spider};
use frangipani::util::join_url;
use scraper::{Html, Selector};

pub struct DexcodeSpider {}

#[async_trait]
impl Spider for DexcodeSpider {
    fn name(&self) -> String {
        "dexcode-spider".to_owned()
    }
    
    fn start_urls(&self) -> Vec<String> {
        vec![
            "https://dexcode.com/".to_owned(),
        ]
    }
    
    /// Parse one response; return the number of items scraped from the
    /// page together with the follow-up URLs to enqueue. (The meaning of
    /// the count is inferred from this example: 0 marks a skipped page.)
    async fn parse(&self, response: Response) -> (u64, Vec<String>) {
        // Only HTML documents are parsed for links.
        if response.content_type() != "text/html" {
            return (0, vec![]);
        }

        let url = response.get_url().to_owned();
        // Take ownership of the body as text (panics on a non-UTF-8 body,
        // which is acceptable for a small example).
        let text = response.into_string().unwrap();
        
        let mut urls = vec![];
        {
            // scraper's DOM types are not Send, so they are kept in a scope
            // that ends before this async fn returns its (Send) results.
            let document = Html::parse_document(&text);
            let link_selector = Selector::parse("a").unwrap();
            for link in document.select(&link_selector) {
                if let Some(relative_url) = link.value().attr("href") {
                    // Resolve the possibly-relative href against the page URL.
                    let absolute_url = join_url(&url, relative_url);
                    let req_url = match reqwest::Url::parse(&absolute_url) {
                        Ok(parsed) => parsed,
                        Err(_) => continue, // skip malformed hrefs
                    };
                    if req_url.scheme() != "http" && req_url.scheme() != "https" {
                        continue;
                    }
                    // Only follow links on the dexcode.com domain. `domain()`
                    // is None for IP-address hosts, so avoid a panicking unwrap,
                    // and match the suffix exactly to exclude e.g. notdexcode.com.
                    if req_url
                        .domain()
                        .map_or(false, |d| d == "dexcode.com" || d.ends_with(".dexcode.com"))
                    {
                        urls.push(req_url.to_string());
                    }
                }
            }

            // Extract the page <title> and emit a "url,title" record.
            let title_selector = Selector::parse("title").unwrap();
            let title = match document.select(&title_selector).next() {
                Some(el) => el.inner_html(),
                None => "".to_owned(),
            };
            println!("{},{}", url, title);
        }
        
        // One item (the title record) was scraped; follow the collected URLs.
        (1, urls)
    }
}

#[tokio::main]
async fn main() {
    env_logger::init();

    let spiders: Vec<Box<dyn Spider + Send + Sync>> = vec![
        Box::new(DexcodeSpider {}),
    ];

    // Build an engine with the default configuration and start crawling.
    let mut engine = frangipani::engine(spiders);
    engine.start().await;
}
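
Because the example calls env_logger::init(), log output is controlled by the standard RUST_LOG environment variable, e.g. RUST_LOG=info cargo run.

The join_url helper resolves a possibly-relative href against the URL of the page it came from. Conceptually it behaves like Url::join from the url crate; the stand-in below is an assumption for illustration, not Frangipani's actual code:

use url::Url;

/// Hypothetical stand-in for frangipani::util::join_url: resolve a
/// possibly-relative href against the page's base URL.
fn join_url_sketch(base: &str, href: &str) -> Option<String> {
    let base = Url::parse(base).ok()?;
    // Url::join applies standard resolution rules: absolute hrefs replace
    // the base, root-relative paths keep the host, ".." segments collapse.
    base.join(href).ok().map(|u| u.to_string())
}

fn main() {
    assert_eq!(
        join_url_sketch("https://dexcode.com/blog/", "../about").unwrap(),
        "https://dexcode.com/about"
    );
}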

For continuous crawling, see examples/continuous.rs in the project repository.
