commit 6a3f14362066a116d40e8e68249bb6611aac5847 from: Romain VINCENT date: Sat Jan 10 12:38:20 2026 UTC Start splitting into separate files. Begin writing tests. commit - 00ba4d7d8dbcd8686722b922f2f749431f8575dd commit + 6a3f14362066a116d40e8e68249bb6611aac5847 blob - 4007eb565cabc9255e60815502b62af9666c2740 blob + 4da206bae0b7e09bcf6f92c6c012c644330f763d --- eur-lex-scraper-naive/src/parsers/enacting_terms_parser.rs +++ eur-lex-scraper-naive/src/parsers/enacting_terms_parser.rs @@ -1,11 +1,8 @@ -use nanohtml2text::html2text; use scraper::{ElementRef, Selector}; use thiserror::Error; -use crate::models::{ - articles::Article, - enacting_terms::{Chapter, EnactingTerms, Section}, -}; +use crate::models::enacting_terms::{Chapter, EnactingTerms, Section}; +use crate::parsers::article::{ArticleParser, ArticleParserError}; pub struct EnactingTermParser {} @@ -37,8 +34,18 @@ pub struct ChapterParser {} pub enum ChapterParserError { #[error("error while parsing chapter")] GenericError, + #[error("error while parsing section")] + SectionError(SectionParserError), + #[error("error while parsing article")] + ArticleError(), } +impl From<SectionParserError> for ChapterParserError { + fn from(value: SectionParserError) -> Self { + ChapterParserError::SectionError(value) + } +} + impl ChapterParser { pub fn parse(element: ElementRef) -> Result<Chapter, ChapterParserError> { let mut chapter = Chapter::default(); @@ -46,8 +53,10 @@ impl ChapterParser { let section_selector = Selector::parse("[id^=cpt_]").unwrap(); let section_count = element.select(&section_selector).count(); if section_count > 0 { - // should parse section - todo!() + for section in element.select(&section_selector) { + let section = SectionParser::parse(section)?; + chapter.push(section); + } } else { // should parse article todo!() @@ -66,6 +75,12 @@ pub enum SectionParserError { ArticleError(ArticleParserError), } +impl From<ArticleParserError> for SectionParserError { + fn from(value: ArticleParserError) -> Self { + Self::ArticleError(value) + } +} + impl SectionParser { pub fn 
parse(element: ElementRef) -> Result<Section, SectionParserError> { let mut section = Section::default(); @@ -77,36 +92,3 @@ impl SectionParser { Ok(section) } } - -pub struct ArticleParser {} - -#[derive(Error, Debug, PartialEq, PartialOrd)] -pub enum ArticleParserError { - #[error("error while parsing article")] - GenericError, - #[error("error while parsing article number")] - ErrorNumber, -} - -impl From<ArticleParserError> for SectionParserError { - fn from(value: ArticleParserError) -> Self { - Self::ArticleError(value) - } -} - -impl ArticleParser { - pub fn parse(element: ElementRef) -> Result<Article, ArticleParserError> { - let id = match element.attr("id") { - Some(id) => id.to_string(), - None => return Err(ArticleParserError::ErrorNumber), - }; - let number_str = id.replace("art_", ""); - let number: u32 = match number_str.parse() { - Ok(number) => number, - Err(_) => return Err(ArticleParserError::ErrorNumber), - }; - let text = html2text(&element.inner_html()); - let article = Article { number, text }; - Ok(article) - } -} blob - /dev/null blob + 15c941f6a64b1aca3a0d207f5af15de9656aa85a (mode 644) --- /dev/null +++ eur-lex-scraper-naive/src/parsers/article.rs @@ -0,0 +1,172 @@ +use crate::models::articles::Article; +use nanohtml2text::html2text; +use scraper::ElementRef; +use thiserror::Error; + +pub struct ArticleParser {} + +#[derive(Error, Debug, PartialEq, PartialOrd)] +pub enum ArticleParserError { + #[error("error while parsing article")] + GenericError, + #[error("error while parsing article number")] + ErrorNumber, +} + +impl ArticleParser { + pub fn parse(element: ElementRef) -> Result<Article, ArticleParserError> { + let id = match element.attr("id") { + Some(id) => id.to_string(), + None => return Err(ArticleParserError::ErrorNumber), + }; + let number_str = id.replace("art_", ""); + let number: u32 = match number_str.parse() { + Ok(number) => number, + Err(_) => return Err(ArticleParserError::ErrorNumber), + }; + let text = html2text(&element.inner_html()); + let article = Article { number, text }; + Ok(article) + } +} + +#[cfg(test)] +mod 
tests { + + use super::*; + use scraper::{Html, Selector}; + + fn get_article1_html() -> String { + let article_html = r#" +
+

+ GENERAL PROVISIONS +

+
+
+

Article 1

+
+

Subject matter`

+
+
+

1.   The purpose of this Regulation is to improve the functioning of the internal market and promote the uptake of human-centric and trustworthy artificial intelligence (AI), while ensuring a high level of protection of health, safety, fundamental rights enshrined in the Charter, including democracy, the rule of law and environmental protection, against the harmful effects of AI systems in the Union and supporting innovation.

+
+
+

2.   This Regulation lays down:

+ + + + + + + + + +
+

(a)

+
+

harmonised rules for the placing on the market, the putting into service, and the use of AI systems in the Union;

+
+ + + + + + + + + +
+

(b)

+
+

prohibitions of certain AI practices;

+
+ + + + + + + + + +
+

(c)

+
+

specific requirements for high-risk AI systems and obligations for operators of such systems;

+
+ + + + + + + + + +
+

(d)

+
+

harmonised transparency rules for certain AI systems;

+
+ + + + + + + + + +
+

(e)

+
+

harmonised rules for the placing on the market of general-purpose AI models;

+
+ + + + + + + + + +
+

(f)

+
+

rules on market monitoring, market surveillance, governance and enforcement;

+
+ + + + + + + + + +
+

(g)

+
+

measures to support innovation, with a particular focus on SMEs, including start-ups.

+
+
+
+ "#; + article_html.to_string() + } + + #[test] + fn item_parsing_citation() { + let html = Html::parse_fragment(&get_article1_html()); + let selector = Selector::parse("[id^=art_]").unwrap(); + let element_ref = html.select(&selector).next().unwrap(); + let article_left = ArticleParser::parse(element_ref).unwrap(); + let article_right = Article { + number: 1, + text: html2text(&element_ref.inner_html()), + }; + assert_eq!(article_left, article_right) + } +} blob - 5a3db55689fda2936bdd6c6713cd56eef59b1d1c blob + beb3511efeac1aea3dded27c39b515121ce0a6a2 --- eur-lex-scraper-naive/src/parsers/mod.rs +++ eur-lex-scraper-naive/src/parsers/mod.rs @@ -1,5 +1,6 @@ pub mod act_parser; pub mod act_title_parser; +pub mod article; pub mod enacting_terms_parser; pub mod preamble_item_parser; pub mod preamble_parser;