#format #parser #universal #sentence #token #conllu #co-nll-u

bin+lib rs-conllu

A parser for the CoNLL-U format of the Universal Dependencies project

1 unstable release

0.1.0 Apr 22, 2023

#1910 in Parser implementations

MIT/Apache

17KB
403 lines

rs-conllu

This project aims to provide a parser for the CoNLL-U format of the Universal Dependencies project: https://universaldependencies.org/format.html.

Basic Usage

Parse a file in CoNLL-U format and iterate over the sentences it contains.

let file = File::open("example.conllu").unwrap();

let doc = parse_file(file);

for sentence in doc {
    for token in sentence.unwrap() {
        println!("{}", token.form);
    }
}

Features

  • Tested on version 2.11 UD treebanks
  • Handles different types of token ids (single, range, subordinate)

Limitations

Parsing happens in a "flat" manner; relations between tokens are not respected.


lib.rs:

A library for parsing the CoNLL-U format.

Basic Usage

Parse a sentence in CoNLL-U format and iterate over the Token elements it contains. The example is taken from the CoNLL-U format description.

use rs_conllu::{parse_sentence, TokenID};

let s = "# sent_id = 1
## text = They buy and sell books.
1	They	they	PRON	PRP	Case=Nom|Number=Plur	2	nsubj	2:nsubj|4:nsubj	_
2	buy	buy	VERB	VBP	Number=Plur|Person=3|Tense=Pres	0	root	0:root	_
3	and	and	CCONJ	CC	_	4	cc	4:cc	_
4	sell	sell	VERB	VBP	Number=Plur|Person=3|Tense=Pres	2	conj	0:root|2:conj	_
5	books	book	NOUN	NNS	Number=Plur	2	obj	2:obj|4:obj	SpaceAfter=No
6	.	.	PUNCT	.	_	2	punct	2:punct	_
";

let sentence = parse_sentence(s).unwrap();
let mut token_iter = sentence.into_iter();

assert_eq!(token_iter.next().unwrap().id, TokenID::Single(1));
assert_eq!(token_iter.next().unwrap().form, "buy".to_owned());

Dependencies

~1.4–9MB
~78K SLoC