commit - f3bb73ef08e4bd67cf31d05f22684185c2602d24
commit + 328366910293466b36dbfcb39511b8b6bc273550
blob - 9e74a47e650e208d76c209b89754a9e632484150
blob + 05395d79f03075225880582debaf53dc5ddf5eba
--- eur-lex-scraper/src/models/acts.rs
+++ eur-lex-scraper/src/models/acts.rs
-use crate::models::preambles::Preamble;
+use crate::models::{enacting_terms::EnactingTerms, preambles::Preamble};
#[derive(Default, Debug, Clone, PartialEq)]
pub struct EUAct {
pub title: String,
pub preamble: Preamble,
+ pub enacting_terms: EnactingTerms,
}
blob - 9033263b69faa760527389cc64fe58f760605606
blob + 2b17d016d325384df22aeedde464d4682028a58c
--- eur-lex-scraper/src/models/articles.rs
+++ eur-lex-scraper/src/models/articles.rs
use crate::models::enacting_terms::Item;
-#[derive(Debug, Default, PartialEq, Eq)]
+#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub struct Article {
pub title: String,
pub number: u32,
blob - 0e4d0727da318cfba1bf97599d6c313d32ee3971
blob + be404e25c4d19e77c886af9625174c2655747147
--- eur-lex-scraper/src/models/enacting_terms.rs
+++ eur-lex-scraper/src/models/enacting_terms.rs
use crate::models::articles::Article;
-#[derive(Debug, PartialEq, Eq)]
+#[derive(Clone, Debug, PartialEq, Eq)]
pub enum Item {
Article(Article),
Chapter(Chapter),
}
}
-#[derive(Debug, Default, PartialEq, Eq)]
+#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub struct EnactingTerms {
pub items: Vec<Item>,
}
}
}
-#[derive(Debug, Default, PartialEq, Eq)]
+#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub struct Chapter {
pub items: Vec<Item>,
}
}
}
-#[derive(Debug, Default, PartialEq, Eq)]
+#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub struct Section {
pub items: Vec<Article>,
}
blob - 61f872aec5803980ed69405184f3191d255ff1cc
blob + a3b88f56fc9ba356d030385d3c405172de36f99c
--- eur-lex-scraper/src/parsers/act.rs
+++ eur-lex-scraper/src/parsers/act.rs
use crate::models::acts::EUAct;
+use crate::parsers::enacting_terms::EnactingTermParser;
use crate::parsers::{
act_title::{EUActTileParser, EUActTitleParserError},
preamble::PreambleParser,
let preamble_section = act_html.select(&preamble_selector).next().unwrap();
let preamble = PreambleParser::parse(preamble_section).unwrap();
- Ok(EUAct { title, preamble })
+ // /////////////
+ // Get enacting terms
+ let enacting_terms_selector = Selector::parse("#enc_1").unwrap();
+ let enacting_terms_section = act_html.select(&enacting_terms_selector).next().unwrap();
+ let enacting_terms = EnactingTermParser::parse(enacting_terms_section).unwrap();
+
+ Ok(EUAct {
+ title,
+ preamble,
+ enacting_terms,
+ })
}
}
let title = get_act_title();
let act = EUActParser::parse(&get_act_html()).unwrap();
assert_eq!(act.title, title);
+ assert_eq!(act.enacting_terms.items.len(), 13);
}
}
blob - 0bff7ef487c062276c514afa7897c63cf440e898
blob + 89bcd19843271bc087ffb5a7cdd9ea05f2becce1
--- eur-lex-scraper/src/parsers/act_title.rs
+++ eur-lex-scraper/src/parsers/act_title.rs
-use crate::models::{acts::EUAct, preambles::Preamble};
use nanohtml2text::html2text;
-use scraper::{ElementRef, Html, Selector};
+use scraper::{ElementRef, Selector};
use thiserror::Error;
#[derive(Error, Debug)]
}
}
-#[derive(Error, Debug)]
-pub enum EUActParserError {
- #[error("error while parsing title: {0}")]
- TitleError(EUActTitleParserError),
-}
-
-impl From<EUActTitleParserError> for EUActParserError {
- fn from(value: EUActTitleParserError) -> Self {
- EUActParserError::TitleError(value)
- }
-}
-
-pub struct EUActParser {}
-
-impl EUActParser {
- pub fn parse(html: &str) -> Result<EUAct, EUActParserError> {
- let act_html = Html::parse_document(html);
-
- // //////////////
- // Get act title
- let title_selector = Selector::parse(".eli-main-title").unwrap();
- let title_element = act_html.select(&title_selector).next().unwrap();
- let title = EUActTileParser::parse(title_element)?;
- Ok(EUAct {
- title,
- preamble: Preamble::default(),
- })
- }
-}
-
#[cfg(test)]
mod tests {
use std::fs;
use super::*;
+ use crate::parsers::act::EUActParser;
fn get_act_html_simple() -> String {
fs::read_to_string("data/test_act_simple.html").unwrap()
blob - 85aa9edf97b6993d072ce51b24010110ef6d1c2d
blob + 8a51f9f779ddd722b54f402d244d3884f8005d91
--- eur-lex-scraper/src/parsers/chapter.rs
+++ eur-lex-scraper/src/parsers/chapter.rs
pub fn parse(element: ElementRef) -> Result<Chapter, ChapterParserError> {
let mut chapter = Chapter::default();
let section_selector =
- Selector::parse(r#"[id^="cpt_"][id*="sct_"]:not([id*="tit_"])}"#).unwrap();
+ Selector::parse(r#"[id^="cpt_"][id*="sct_"]:not([id*="tit_"])"#).unwrap();
let section_count = element.select(&section_selector).count();
// If there are sections, parse them. Otherwise, it must be articles.
if section_count > 0 {
blob - 1e54aa19a2acfbddd28779e7d31ff3c84cb06dc9
blob + 1c7c1d21b0adfecb2a355453f4306ac9ba4ebea0
--- eur-lex-scraper/src/parsers/section.rs
+++ eur-lex-scraper/src/parsers/section.rs
section_left.items.get(1).unwrap().title,
"Amendments to Annex III".to_string()
);
-
- /*
- let article_right = Article {
- number: 1,
- text: html2text(&element_ref.inner_html()),
- };
- assert_eq!(article_left, article_right)
- */
}
}