Commit Diff


commit - 358471e349765354d6c4aa8172b94e27c2b2515c
commit + 78f7dcfd62590dd043b3efe7fedd36d806be1f21
blob - 59939a3bd4f42b8d2691492a71e9f1e56e6fa925
blob + c9fb979d512b95d86f965fe461e65d25d888b000
--- eur-lex-scraper/src/models/section.rs
+++ eur-lex-scraper/src/models/section.rs
@@ -2,6 +2,7 @@ use crate::models::{articles::Article, enacting_terms:
 
 #[derive(Clone, Debug, Default, PartialEq, Eq)]
 pub struct Section {
+    title: String,
     items: Vec<Article>,
 }
 
@@ -21,6 +22,12 @@ impl IntoIterator for Section {
 }
 
 impl Section {
+    pub fn set_title(&mut self, title: String) {
+        self.title = title;
+    }
+    pub fn get_title(&self) -> &str {
+        &self.title
+    }
     pub fn push(&mut self, article: Article) {
         self.items.push(article)
     }
blob - 161624a33298dea9b4ce298f8fa8d7975108ed54
blob + 0eaf1d07e47fd85d37c69b9cf1af59c94fd8b783
--- eur-lex-scraper/src/parsers/section.rs
+++ eur-lex-scraper/src/parsers/section.rs
@@ -10,6 +10,8 @@ pub struct SectionParser {}
 pub enum SectionParserError {
     #[error("error while parsing section")]
     GenericError,
+    #[error("error while parsing the title")]
+    TitleError,
     #[error("error parsing article")]
     ArticleError(ArticleParserError),
 }
@@ -23,6 +25,13 @@ impl From<ArticleParserError> for SectionParserError {
 impl SectionParser {
     pub fn parse(element: ElementRef) -> Result<Section, SectionParserError> {
         let mut section = Section::default();
+        // This class should appear only once per section
+        let section_title_selector = Selector::parse(r#".oj-ti-section-2"#).unwrap();
+        let title = match element.select(&section_title_selector).next() {
+            Some(title) => title.inner_html(),
+            None => return Err(SectionParserError::TitleError),
+        };
+        section.set_title(nanohtml2text::html2text(&title).trim().to_string());
         // select article but not titles
         let article_selector = Selector::parse(r#"[id^="art_"]:not([id*=".tit"])"#).unwrap();
         for article in element.select(&article_selector) {
@@ -44,11 +53,15 @@ mod tests {
     }
 
     #[test]
-    fn parsing_article() {
+    fn parsing_section_1() {
         let html = Html::parse_fragment(&get_test_section_1());
         let selector = Selector::parse(r#"[id*="sct_"]:not([id*=".tit_"])"#).unwrap();
         let element_ref = html.select(&selector).next().unwrap();
         let section_left = SectionParser::parse(element_ref).unwrap();
+        assert_eq!(
+            section_left.get_title(),
+            "Classification of AI systems as high-risk"
+        );
         assert_eq!(section_left.len(), 2);
         assert_eq!(section_left.get(0).unwrap().get_number(), 6);
         assert_eq!(