commit - 5cfb505c92887a2789a67f433b7030da648b5f0f
commit + fa3ca37ff9b9e995cf8dd43e6c2aea9d3c7c0f34
blob - /dev/null
blob + cb0ed49c2e95a725feac9289d5fb201519b65b2a (mode 644)
--- /dev/null
+++ eur-lex-scraper/data/tests/parsers/article/article_test_2.html
+<div class="eli-subdivision" id="art_">
+ <p id="d1e2795-1-1" class="oj-ti-art">Article 4</p>
+ <div class="eli-title" id="art_4.tit_1">
+ <p class="oj-sti-art">AI literacy</p>
+ </div>
+ <p class="oj-normal">Providers and deployers of AI systems shall take measures to ensure, to their best extent, a sufficient level of AI literacy of their staff and other persons dealing with the operation and use of AI systems on their behalf, taking into account their technical knowledge, experience, education and training and the context the AI systems are to be used in, and considering the persons or groups of persons on whom the AI systems are to be used.</p>
+</div>
blob - /dev/null
blob + 490f229466de01de985693f36feb246aa07a2cb7 (mode 644)
--- /dev/null
+++ eur-lex-scraper/data/tests/parsers/article/article_test_3.html
+<div class="eli-subdivision" id="art_4">
+ <p id="d1e2795-1-1" class="oj-ti-art">Article 4</p>
+ <p class="oj-normal">Providers and deployers of AI systems shall take measures to ensure, to their best extent, a sufficient level of AI literacy of their staff and other persons dealing with the operation and use of AI systems on their behalf, taking into account their technical knowledge, experience, education and training and the context the AI systems are to be used in, and considering the persons or groups of persons on whom the AI systems are to be used.</p>
+</div>
blob - /dev/null
blob + e4a8d06b6590a9d6f3882e876cba5666f8826026 (mode 644)
--- /dev/null
+++ eur-lex-scraper/data/tests/parsers/article/article_test_4.html
+<div class="eli-subdivision">
+ <p id="d1e2795-1-1" class="oj-ti-art">Article 4</p>
+ <div class="eli-title" id="art_4.tit_1">
+ <p class="oj-sti-art">AI literacy</p>
+ </div>
+ <p class="oj-normal">Providers and deployers of AI systems shall take measures to ensure, to their best extent, a sufficient level of AI literacy of their staff and other persons dealing with the operation and use of AI systems on their behalf, taking into account their technical knowledge, experience, education and training and the context the AI systems are to be used in, and considering the persons or groups of persons on whom the AI systems are to be used.</p>
+</div>
blob - 108137e5d89c59765d119e7824718feb26e8f348
blob + 3e3dcd2532da8ba483a7310b7e3ab65c387f68f5
--- eur-lex-scraper/src/parsers/act_title.rs
+++ eur-lex-scraper/src/parsers/act_title.rs
#[cfg(test)]
mod tests {
- use std::fs;
-
- use super::*;
use crate::parsers::act::EUActParser;
+ use std::fs;
fn get_act_html_simple() -> String {
fs::read_to_string("data/tests/parsers/act/test_act_simple.html").unwrap()
blob - fe66bc85915175e8f495cdebb223e928eb197f37
blob + d433b9cd088a8ce5c945d648a1da87d176b28850
--- eur-lex-scraper/src/parsers/article.rs
+++ eur-lex-scraper/src/parsers/article.rs
#[derive(Error, Debug, PartialEq, PartialOrd)]
pub enum ArticleParserError {
- #[error("error while parsing article")]
- GenericError,
+ #[error("error while parsing article title")]
+ TitleError,
#[error("error while parsing article number")]
ErrorNumber,
+ #[error("error while parsing article id: no id attribute")]
+ NoIdAttribute,
}
impl ArticleParser {
pub fn parse(element: ElementRef) -> Result<Article, ArticleParserError> {
let id = match element.attr("id") {
Some(id) => id.to_string(),
- None => return Err(ArticleParserError::ErrorNumber),
+ None => return Err(ArticleParserError::NoIdAttribute),
};
let number_str = id.replace("art_", "");
let number: u32 = match number_str.parse() {
let article_title_selector = Selector::parse(r#".oj-sti-art"#).unwrap();
let title = match element.select(&article_title_selector).next() {
Some(title) => title.inner_html(),
- None => "".to_string(),
+ None => return Err(ArticleParserError::TitleError),
};
let text = html2text(&element.inner_html());
let article = Article {
use scraper::{Html, Selector};
use std::fs;
+ // Well-formed article
fn get_test_article_1() -> String {
fs::read_to_string("data/tests/parsers/article/article_test_1.html").unwrap()
}
+ // Article with missing number
+ fn get_test_article_2() -> String {
+ fs::read_to_string("data/tests/parsers/article/article_test_2.html").unwrap()
+ }
+ // Article with no title
+ fn get_test_article_3() -> String {
+ fs::read_to_string("data/tests/parsers/article/article_test_3.html").unwrap()
+ }
+ // Article with no id attribute
+ fn get_test_article_4() -> String {
+ fs::read_to_string("data/tests/parsers/article/article_test_4.html").unwrap()
+ }
#[test]
- fn parsing_article() {
+ fn parsing_article_1_well_formed() {
let html = Html::parse_fragment(&get_test_article_1());
let selector = Selector::parse("[id^=art_]").unwrap();
let element_ref = html.select(&selector).next().unwrap();
let article_left = ArticleParser::parse(element_ref).unwrap();
let article_right = Article {
- title: "Subject matter`".to_string(),
+ title: "AI literacy".to_string(),
number: 4,
text: html2text(&element_ref.inner_html()),
};
assert_eq!(article_left, article_right)
}
+ #[test]
+ fn parsing_article_no_number_return_number_error() {
+ let html = Html::parse_fragment(&get_test_article_2());
+ let selector = Selector::parse("[id^=art_]").unwrap();
+ let element_ref = html.select(&selector).next().unwrap();
+ let article_left = ArticleParser::parse(element_ref).unwrap_err();
+ let article_right = ArticleParserError::ErrorNumber;
+ assert_eq!(article_left, article_right)
+ }
+ #[test]
+ fn parsing_article_no_title_return_title_error() {
+ let html = Html::parse_fragment(&get_test_article_3());
+ let selector = Selector::parse("[id^=art_]").unwrap();
+ let element_ref = html.select(&selector).next().unwrap();
+ let article_left = ArticleParser::parse(element_ref).unwrap_err();
+ let article_right = ArticleParserError::TitleError;
+ assert_eq!(article_left, article_right)
+ }
+ #[test]
+ fn parsing_article_no_id_return_no_id_attribute_error() {
+ let html = Html::parse_fragment(&get_test_article_4());
+ // Change the selector for test purposes since we cannot
+ // select on the id attribute (this fixture has none)
+ let selector = Selector::parse(".eli-subdivision").unwrap();
+ let element_ref = html.select(&selector).next().unwrap();
+ let article_left = ArticleParser::parse(element_ref).unwrap_err();
+ let article_right = ArticleParserError::NoIdAttribute;
+ assert_eq!(article_left, article_right)
+ }
}