commit 328366910293466b36dbfcb39511b8b6bc273550 from: Romain VINCENT date: Fri Jan 16 07:54:03 2026 UTC Reordering a bit (test fail on enacting terms for now). commit - f3bb73ef08e4bd67cf31d05f22684185c2602d24 commit + 328366910293466b36dbfcb39511b8b6bc273550 blob - 9e74a47e650e208d76c209b89754a9e632484150 blob + 05395d79f03075225880582debaf53dc5ddf5eba --- eur-lex-scraper/src/models/acts.rs +++ eur-lex-scraper/src/models/acts.rs @@ -1,7 +1,8 @@ -use crate::models::preambles::Preamble; +use crate::models::{enacting_terms::EnactingTerms, preambles::Preamble}; #[derive(Default, Debug, Clone, PartialEq)] pub struct EUAct { pub title: String, pub preamble: Preamble, + pub enacting_terms: EnactingTerms, } blob - 9033263b69faa760527389cc64fe58f760605606 blob + 2b17d016d325384df22aeedde464d4682028a58c --- eur-lex-scraper/src/models/articles.rs +++ eur-lex-scraper/src/models/articles.rs @@ -1,6 +1,6 @@ use crate::models::enacting_terms::Item; -#[derive(Debug, Default, PartialEq, Eq)] +#[derive(Clone, Debug, Default, PartialEq, Eq)] pub struct Article { pub title: String, pub number: u32, blob - 0e4d0727da318cfba1bf97599d6c313d32ee3971 blob + be404e25c4d19e77c886af9625174c2655747147 --- eur-lex-scraper/src/models/enacting_terms.rs +++ eur-lex-scraper/src/models/enacting_terms.rs @@ -1,6 +1,6 @@ use crate::models::articles::Article; -#[derive(Debug, PartialEq, Eq)] +#[derive(Clone, Debug, PartialEq, Eq)] pub enum Item { Article(Article), Chapter(Chapter), @@ -34,7 +34,7 @@ impl Item { } } -#[derive(Debug, Default, PartialEq, Eq)] +#[derive(Clone, Debug, Default, PartialEq, Eq)] pub struct EnactingTerms { pub items: Vec, } @@ -45,7 +45,7 @@ impl EnactingTerms { } } -#[derive(Debug, Default, PartialEq, Eq)] +#[derive(Clone, Debug, Default, PartialEq, Eq)] pub struct Chapter { pub items: Vec, } @@ -62,7 +62,7 @@ impl Into for Chapter { } } -#[derive(Debug, Default, PartialEq, Eq)] +#[derive(Clone, Debug, Default, PartialEq, Eq)] pub struct Section { pub items: Vec
, } blob - 61f872aec5803980ed69405184f3191d255ff1cc blob + a3b88f56fc9ba356d030385d3c405172de36f99c --- eur-lex-scraper/src/parsers/act.rs +++ eur-lex-scraper/src/parsers/act.rs @@ -1,4 +1,5 @@ use crate::models::acts::EUAct; +use crate::parsers::enacting_terms::EnactingTermParser; use crate::parsers::{ act_title::{EUActTileParser, EUActTitleParserError}, preamble::PreambleParser, @@ -37,7 +38,17 @@ impl EUActParser { let preamble_section = act_html.select(&preamble_selector).next().unwrap(); let preamble = PreambleParser::parse(preamble_section).unwrap(); - Ok(EUAct { title, preamble }) + // ///////////// + // Get enacting terms + let enacting_terms_selector = Selector::parse("#enc_1").unwrap(); + let enacting_terms_section = act_html.select(&enacting_terms_selector).next().unwrap(); + let enacting_terms = EnactingTermParser::parse(enacting_terms_section).unwrap(); + + Ok(EUAct { + title, + preamble, + enacting_terms, + }) } } @@ -70,5 +81,6 @@ mod tests { let title = get_act_title(); let act = EUActParser::parse(&get_act_html()).unwrap(); assert_eq!(act.title, title); + assert_eq!(act.enacting_terms.items.len(), 13); } } blob - 0bff7ef487c062276c514afa7897c63cf440e898 blob + 89bcd19843271bc087ffb5a7cdd9ea05f2becce1 --- eur-lex-scraper/src/parsers/act_title.rs +++ eur-lex-scraper/src/parsers/act_title.rs @@ -1,6 +1,5 @@ -use crate::models::{acts::EUAct, preambles::Preamble}; use nanohtml2text::html2text; -use scraper::{ElementRef, Html, Selector}; +use scraper::{ElementRef, Selector}; use thiserror::Error; #[derive(Error, Debug)] @@ -32,41 +31,12 @@ impl EUActTileParser { } } -#[derive(Error, Debug)] -pub enum EUActParserError { - #[error("error while parsing title: {0}")] - TitleError(EUActTitleParserError), -} - -impl From for EUActParserError { - fn from(value: EUActTitleParserError) -> Self { - EUActParserError::TitleError(value) - } -} - -pub struct EUActParser {} - -impl EUActParser { - pub fn parse(html: &str) -> Result { - let act_html = 
Html::parse_document(html); - - // ////////////// - // Get act title - let title_selector = Selector::parse(".eli-main-title").unwrap(); - let title_element = act_html.select(&title_selector).next().unwrap(); - let title = EUActTileParser::parse(title_element)?; - Ok(EUAct { - title, - preamble: Preamble::default(), - }) - } -} - #[cfg(test)] mod tests { use std::fs; use super::*; + use crate::parsers::act::EUActParser; fn get_act_html_simple() -> String { fs::read_to_string("data/test_act_simple.html").unwrap() blob - 85aa9edf97b6993d072ce51b24010110ef6d1c2d blob + 8a51f9f779ddd722b54f402d244d3884f8005d91 --- eur-lex-scraper/src/parsers/chapter.rs +++ eur-lex-scraper/src/parsers/chapter.rs @@ -33,7 +33,7 @@ impl ChapterParser { pub fn parse(element: ElementRef) -> Result { let mut chapter = Chapter::default(); let section_selector = - Selector::parse(r#"[id^="cpt_"][id*="sct_"]:not([id*="tit_"])}"#).unwrap(); + Selector::parse(r#"[id^="cpt_"][id*="sct_"]:not([id*="tit_"])"#).unwrap(); let section_count = element.select(&section_selector).count(); // If there are sections, parse them. Otherwise, it must be articles. if section_count > 0 { blob - 1e54aa19a2acfbddd28779e7d31ff3c84cb06dc9 blob + 1c7c1d21b0adfecb2a355453f4306ac9ba4ebea0 --- eur-lex-scraper/src/parsers/section.rs +++ eur-lex-scraper/src/parsers/section.rs @@ -442,13 +442,5 @@ mod tests { section_left.items.get(1).unwrap().title, "Amendments to Annex III".to_string() ); - - /* - let article_right = Article { - number: 1, - text: html2text(&element_ref.inner_html()), - }; - assert_eq!(article_left, article_right) - */ } }