commit d30b6c2cbca773270d71c6bc757ff036cbcf1188 from: Romain VINCENT date: Fri Nov 14 21:41:03 2025 UTC First implementation of Recital. commit - 321930bbf7f63de99fec843db5f596dc8413f192 commit + d30b6c2cbca773270d71c6bc757ff036cbcf1188 blob - /dev/null blob + 96ef6c0b944e24fc22f51f18136cd62ffd5b0b8f (mode 644) --- /dev/null +++ .gitignore @@ -0,0 +1,2 @@ +/target +Cargo.lock blob - ea8c4bf7f35f6f77f75d92ad8ce8349f6e81ddba (mode 644) blob + /dev/null --- eur-lex-scrapper/.gitignore +++ /dev/null @@ -1 +0,0 @@ -/target blob - ee95352c6774ebd21c75249b29f4485315f1c32d blob + 314232c6bfbbd58efebf31d5176890ac7e518b97 --- eur-lex-scrapper/Cargo.toml +++ eur-lex-scrapper/Cargo.toml @@ -4,3 +4,6 @@ version = "0.1.0" edition = "2024" [dependencies] +nanohtml2text = "0.2.1" +scraper = "0.24.0" +thiserror = "2.0.17" blob - b93cf3ffd9cc9c59f584a92d7bd1459d5521ef4e blob + c446ac88338ad03abd49d91d75980f521a296caf --- eur-lex-scrapper/src/lib.rs +++ eur-lex-scrapper/src/lib.rs @@ -1,14 +1 @@ -pub fn add(left: u64, right: u64) -> u64 { - left + right -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn it_works() { - let result = add(2, 2); - assert_eq!(result, 4); - } -} +pub mod models; blob - /dev/null blob + 702d54175cb3b9f0805b4bd8801179c3b57757ad (mode 644) --- /dev/null +++ eur-lex-scrapper/src/models/mod.rs @@ -0,0 +1 @@ +pub mod recitals; blob - /dev/null blob + f2c7cba5aa1d41dd3e991ed749d15af6510966db (mode 644) --- /dev/null +++ eur-lex-scrapper/src/models/recitals.rs @@ -0,0 +1,148 @@ +use nanohtml2text::html2text; +use scraper::{Html, Selector}; +use thiserror::Error; + +#[derive(Error, Debug)] +pub enum RecitalParsingError { + #[error("failed extracting number")] + ExtractingNumber, + #[error("failed extracting text")] + ExtractingText, +} + +pub fn sanatize(mut text: String) -> String { + // Remove possible artefacts + text = text.replace(" .", "."); + text = text.replace(" ,", ","); + text = text.replace("  ", " "); + text.to_string() +} + +#[derive(Debug, Default)] +pub struct Recital { + pub number: i32, + pub text: String, +} + +impl Recital { + pub fn from_str(html: &str) -> Result { + let mut recital = Recital::default(); + + // ////////////////// + // Get recital number + // + let fragment = Html::parse_fragment(html); + let selector_content = Selector::parse(".oj-normal").unwrap(); + let mut frag = fragment.select(&selector_content); + let number = match frag.next() { + Some(number) => number, + None => return Err(RecitalParsingError::ExtractingNumber), + }; + let number = number + .inner_html() + .trim_matches('(') + .trim_matches(')') + .to_string(); + if let Ok(number) = number.parse::() { + recital.number = number; + }; + + // /////////////////// + // Get recital content + // + let text = match frag.next() { + Some(text) => text, + None => return Err(RecitalParsingError::ExtractingText), + }; + + let mut text_html = text.inner_html(); + // Remove href citations from html code + for citation in text.child_elements() { + text_html = text_html.replace(&citation.html(), ""); + } + text_html = sanatize(text_html); + recital.text = html2text(&text_html); + + Ok(recital) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn get_recital_html() -> String { + let recital_html = r#" +
+ + + + + + + + + + + +
+

(28)

+
+

Aside from the many beneficial uses of AI, it can also be misused and provide novel and powerful tools for manipulative, exploitative and social control practices. Such practices are particularly harmful and abusive and should be prohibited because they contradict Union values of respect for human dignity, freedom, equality, democracy and the rule of law and fundamental rights enshrined in the Charter, including the right to non-discrimination, to data protection and to privacy and the rights of the child.

+
+
+ "#; + recital_html.into() + } + + fn get_recital_html_bis() -> String { + let recital_html = r#" +
+ + + + + + + + + + + +
+

(9)

+
+

Harmonised rules applicable to the placing on the market, the putting into service and the use of high-risk AI systems should be laid down consistently with Regulation (EC) No 765/2008 of the European Parliament and of the Council (7), Decision No 768/2008/EC of the European Parliament and of the Council (8) and Regulation (EU) 2019/1020 of the European Parliament and of the Council (9) (New Legislative Framework).

+
+
+ "#; + recital_html.into() + } + + #[test] + fn recital_parsing_text() { + let text = "Aside from the many beneficial uses of AI, it can also be misused and provide novel and powerful tools for manipulative, exploitative and social control practices. Such practices are particularly harmful and abusive and should be prohibited because they contradict Union values of respect for human dignity, freedom, equality, democracy and the rule of law and fundamental rights enshrined in the Charter, including the right to non-discrimination, to data protection and to privacy and the rights of the child."; + let html = get_recital_html(); + let recital = Recital::from_str(&html).unwrap(); + assert_eq!(recital.text, text.to_string()); + } + #[test] + fn recital_parsing_number() { + let html = get_recital_html(); + let recital = Recital::from_str(&html).unwrap(); + assert_eq!(recital.number, 28); + } + #[test] + fn recital_parsing_text_with_citation() { + let text = "Harmonised rules applicable to the placing on the market, the putting into service and the use of high-risk AI systems should be laid down consistently with Regulation (EC)\u{a0}No\u{a0}765/2008 of the European Parliament and of the Council, Decision\u{a0}No\u{a0}768/2008/EC of the European Parliament and of the Council and Regulation\u{a0}(EU) 2019/1020 of the European Parliament and of the Council (New Legislative Framework)."; + let html = get_recital_html_bis(); + let recital = Recital::from_str(&html).unwrap(); + assert_eq!(recital.text, text.to_string()); + } + #[test] + fn recital_parsing_number_with_citation() { + let html = get_recital_html_bis(); + let recital = Recital::from_str(&html).unwrap(); + assert_eq!(recital.number, 9); + } +}