13 releases

0.1.93 Mar 6, 2021
0.1.92 Mar 6, 2021
0.1.91 Jan 21, 2021

#1918 in Parser implementations

MIT/Apache

49KB
1K SLoC

Capricorn

Parse HTML according to configuration.

Capricorn is an HTML parsing library that supports recursion and a custom execution order.

Version info Downloads docs example branch parameter dependency status

Default execution order

vec![String::from("selects"),
    String::from("each"),
    String::from("select_params"),
    String::from("nodes"),
    String::from("has"),
    String::from("contains")];
    
selects > each > (one or all or fields) > ... text_attr_html > (text or attr or html);
selects > select_params > selects > ... text_attr_html > (text or attr or html);
selects > nodes > has > contains > text_attr_html > (text or attr or html);

Support:

Capricorn support example val type
selects element field_name:
  selects:
      - element_name
String
selects class field_name:
  selects:
      - .class_name
String
selects class element field_name:
  selects:
      - .class_name
      - element_name
String
first field_name:
  selects:
      - element_name
  nodes:
      first: true
String
last field_name:
  selects:
      - element_name
  nodes:
      last: true
String
eq field_name:
  selects:
      - element_name
  nodes:
      eq: 0
String
parent field_name:
  selects:
      - element_name
  nodes:
      parent: true
String
children field_name:
  selects:
      - element_name
  nodes:
      children: true
String
prev_sibling field_name:
  selects:
      - element_name
  nodes:
      prev_sibling: true
String
next_sibling field_name:
  selects:
      - element_name
  nodes:
      next_sibling: true
String
has_class field_name:
  selects:
      - element_name
  has:
      class: class_name
String
has_attr field_name:
  selects:
      - element_name
  has:
      attr: attr_name
String
each one field_name:
  selects:
      - element_name
  each:
      one:
          selects:
              - .class_name
          ...
String
each all field_name:
  selects:
      - element_name
  each:
      all:
          selects:
              - .class_name
          ...
Array
each fields field_name:
  selects:
      - element_name
  each:
      fields:
        field_name:
          selects:
              - .class_name
          ...
        field_name1:
          selects:
              - .class_name
          ...
Map
select_params field_name:
  selects:
      - element_name
  select_params:
      selects:
          - .class_name
      ...
...
text field_name:
  selects:
      - element_name
  text_attr_html:
      text: true
String
attr field_name:
  selects:
      - element_name
  text_attr_html:
      attr: true
String
html field_name:
  selects:
      - element_name
  text_attr_html:
      html: true
String
text contains field_name:
  selects:
      - element_name
  contains:
      contains:
          text:
              - test
String
text not contains field_name:
  selects:
      - element_name
  contains:
      not_contains:
          text:
              - test
String
html contains field_name:
  selects:
      - element_name
  contains:
      contains:
          html:
              - test
String
html not contains field_name:
  selects:
      - element_name
  contains:
      not_contains:
          html:
              - test
String
exec order field_name:
  exec_order:
      - selects
      - has
      - nodes
  selects:
      - element_name
  has:
      class: class_name
  nodes:
      first: true
String
data format splits field_name:
  selects:
      - element_name
  data_format:
      splits:
          - { key: str }
Array
data format splits field_name:
  selects:
      - element_name
  data_format:
      splits:
          - { key: str, index: 0 }
String
data format replaces field_name:
  selects:
      - element_name
  data_format:
      replaces:
          - str
String
data format deletes field_name:
  selects:
      - element_name
  data_format:
      deletes:
          - str
String
data format find field_name:
  selects:
      - element_name
  data_format:
      find:
        - regex
String
data format find_iter field_name:
  selects:
      - element_name
  data_format:
      find_iter:
        - regex
Array
Multi-version regex matching err regexes_match_parse_html:
    - regex: regex
      version: 1
      err: err_msg
Err
Multi-version regex matching fields regexes_match_parse_html:
    - regex: regex
      version: 1
      fields:
        field_name:
          selects:
          ...
        field_name:
          selects:
          ...
Map

Parse HTML code, more...

let yml = read_file("./test_html/test.yml").unwrap();
let params: parse::HashMapSelectParams = serde_yaml::from_str(&yml).unwrap();
let html = read_file("./test_html/test.html").unwrap();
let r = parse::parse_html(&params, &html);

Parse HTML code with multi-version regex matching, more...

let yml = read_file("./test_html/regexes_match_parse_html.yml").unwrap();
let v:  match_html::MatchHtmlVec = serde_yaml::from_str(&yml).unwrap();
let html = read_file("./test_html/test.html").unwrap();
let r =  v.regexes_match_parse_html(html)?;

Dependencies

~8–17MB
~195K SLoC