commit 97618e6ac9f8771f0da9e3dedc2a385983d1cb0f from: Romain VINCENT date: Fri Jan 16 07:15:06 2026 UTC First POC for complete Act parsing done. commit - beca2d924c6ed4ad96dc9cf4d26ffe711a82d841 commit + 97618e6ac9f8771f0da9e3dedc2a385983d1cb0f blob - f0222648eb27d3bf21d1157bb87a81e269c94ae4 blob + 8cd11075c2c7fd3c11a88b0a0eb9cda1c364dae0 --- eur-lex-scraper/src/parsers/enacting_terms_parser.rs +++ eur-lex-scraper/src/parsers/enacting_terms_parser.rs @@ -11,19 +11,42 @@ pub struct EnactingTermParser {} pub enum EnactingTermParserError { #[error("error while parsing enacting term")] GenericError, + #[error("error while parsing chapter")] + ChapterError(ChapterParserError), + #[error("error while parsing aricles")] + ArticleError(ArticleParserError), } +impl From<ChapterParserError> for EnactingTermParserError { + fn from(value: ChapterParserError) -> Self { + EnactingTermParserError::ChapterError(value) + } +} + +impl From<ArticleParserError> for EnactingTermParserError { + fn from(value: ArticleParserError) -> Self { + EnactingTermParserError::ArticleError(value) + } +} + impl EnactingTermParser { pub fn parse(element: ElementRef) -> Result<EnactingTerms, EnactingTermParserError> { let mut enacting_terms = EnactingTerms::default(); let chapter_selector = Selector::parse("[id^=cpt_]").unwrap(); let chapter_count = element.select(&chapter_selector).count(); if chapter_count > 0 { - // should parse chapter - todo!() + for chapter in element.select(&chapter_selector) { + let chapter = ChapterParser::parse(chapter)?; + enacting_terms.push(chapter); + } } else { - // should parse article - todo!() + // See following document for document with articles only + // https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:32006D0443 + let article_selector = Selector::parse(r#"[id^="art_"]:not([id*=".tit"])"#).unwrap(); + for article in element.select(&article_selector) { + let article = ArticleParser::parse(article)?; + enacting_terms.push(article); + } } Ok(enacting_terms) } @@ -66,7 +89,8 @@ impl ChapterParser { chapter.push(section); } } else { 
- let article_selector = Selector::parse(r#"[id^="art_"]:not([id*=".tit"])"#).unwrap(); + let article_selector = + Selector::parse(r#"[id^="cpt_"][id*="art_"]:not([id*=".tit"])"#).unwrap(); for article in element.select(&article_selector) { let article = ArticleParser::parse(article)?; chapter.push(article);