Commit Diff


commit - 00ba4d7d8dbcd8686722b922f2f749431f8575dd
commit + 6a3f14362066a116d40e8e68249bb6611aac5847
blob - 4007eb565cabc9255e60815502b62af9666c2740
blob + 4da206bae0b7e09bcf6f92c6c012c644330f763d
--- eur-lex-scraper-naive/src/parsers/enacting_terms_parser.rs
+++ eur-lex-scraper-naive/src/parsers/enacting_terms_parser.rs
@@ -1,11 +1,8 @@
-use nanohtml2text::html2text;
 use scraper::{ElementRef, Selector};
 use thiserror::Error;
 
-use crate::models::{
-    articles::Article,
-    enacting_terms::{Chapter, EnactingTerms, Section},
-};
+use crate::models::enacting_terms::{Chapter, EnactingTerms, Section};
+use crate::parsers::article::{ArticleParser, ArticleParserError};
 
 pub struct EnactingTermParser {}
 
@@ -37,8 +34,18 @@ pub struct ChapterParser {}
 pub enum ChapterParserError {
     #[error("error while parsing chapter")]
     GenericError,
+    #[error("error while parsing section")]
+    SectionError(SectionParserError),
+    #[error("error while parsing article")]
+    ArticleError(),
 }
 
+impl From<SectionParserError> for ChapterParserError {
+    fn from(value: SectionParserError) -> Self {
+        ChapterParserError::SectionError(value)
+    }
+}
+
 impl ChapterParser {
     pub fn parse(element: ElementRef) -> Result<Chapter, ChapterParserError> {
         let mut chapter = Chapter::default();
@@ -46,8 +53,10 @@ impl ChapterParser {
         let section_selector = Selector::parse("[id^=cpt_]").unwrap();
         let section_count = element.select(&section_selector).count();
         if section_count > 0 {
-            // should parse section
-            todo!()
+            for section in element.select(&section_selector) {
+                let section = SectionParser::parse(section)?;
+                chapter.push(section);
+            }
         } else {
             // should parse article
             todo!()
@@ -66,6 +75,12 @@ pub enum SectionParserError {
     ArticleError(ArticleParserError),
 }
 
+impl From<ArticleParserError> for SectionParserError {
+    fn from(value: ArticleParserError) -> Self {
+        Self::ArticleError(value)
+    }
+}
+
 impl SectionParser {
     pub fn parse(element: ElementRef) -> Result<Section, SectionParserError> {
         let mut section = Section::default();
@@ -77,36 +92,3 @@ impl SectionParser {
         Ok(section)
     }
 }
-
-pub struct ArticleParser {}
-
-#[derive(Error, Debug, PartialEq, PartialOrd)]
-pub enum ArticleParserError {
-    #[error("error while parsing article")]
-    GenericError,
-    #[error("error while parsing article number")]
-    ErrorNumber,
-}
-
-impl From<ArticleParserError> for SectionParserError {
-    fn from(value: ArticleParserError) -> Self {
-        Self::ArticleError(value)
-    }
-}
-
-impl ArticleParser {
-    pub fn parse(element: ElementRef) -> Result<Article, ArticleParserError> {
-        let id = match element.attr("id") {
-            Some(id) => id.to_string(),
-            None => return Err(ArticleParserError::ErrorNumber),
-        };
-        let number_str = id.replace("art_", "");
-        let number: u32 = match number_str.parse() {
-            Ok(number) => number,
-            Err(_) => return Err(ArticleParserError::ErrorNumber),
-        };
-        let text = html2text(&element.inner_html());
-        let article = Article { number, text };
-        Ok(article)
-    }
-}
blob - /dev/null
blob + 15c941f6a64b1aca3a0d207f5af15de9656aa85a (mode 644)
--- /dev/null
+++ eur-lex-scraper-naive/src/parsers/article.rs
@@ -0,0 +1,172 @@
+use crate::models::articles::Article;
+use nanohtml2text::html2text;
+use scraper::ElementRef;
+use thiserror::Error;
+
+pub struct ArticleParser {}
+
+#[derive(Error, Debug, PartialEq, PartialOrd)]
+pub enum ArticleParserError {
+    #[error("error while parsing article")]
+    GenericError,
+    #[error("error while parsing article number")]
+    ErrorNumber,
+}
+
+impl ArticleParser {
+    pub fn parse(element: ElementRef) -> Result<Article, ArticleParserError> {
+        let id = match element.attr("id") {
+            Some(id) => id.to_string(),
+            None => return Err(ArticleParserError::ErrorNumber),
+        };
+        let number_str = id.replace("art_", "");
+        let number: u32 = match number_str.parse() {
+            Ok(number) => number,
+            Err(_) => return Err(ArticleParserError::ErrorNumber),
+        };
+        let text = html2text(&element.inner_html());
+        let article = Article { number, text };
+        Ok(article)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+
+    use super::*;
+    use scraper::{Html, Selector};
+
+    fn get_article1_html() -> String {
+        let article_html = r#"
+               <div class="eli-title" id="cpt_I.tit_1">
+                  <p id="L_01689EN.01000101-d-001" class="oj-ti-section-2">
+                     <span class="oj-bold">GENERAL PROVISIONS</span>
+                  </p>
+               </div>
+               <div class="eli-subdivision" id="art_1">
+                  <p id="d1e1915-1-1" class="oj-ti-art">Article 1</p>
+                  <div class="eli-title" id="art_1.tit_1">
+                     <p class="oj-sti-art">Subject matter</p>
+                  </div>
+                  <div id="001.001">
+                     <p class="oj-normal">1.   The purpose of this Regulation is to improve the functioning of the internal market and promote the uptake of human-centric and trustworthy artificial intelligence (AI), while ensuring a high level of protection of health, safety, fundamental rights enshrined in the Charter, including democracy, the rule of law and environmental protection, against the harmful effects of AI systems in the Union and supporting innovation.</p>
+                  </div>
+                  <div id="001.002">
+                     <p class="oj-normal">2.   This Regulation lays down:</p>
+                     <table width="100%" border="0" cellspacing="0" cellpadding="0">
+                        <col width="4%"/>
+                        <col width="96%"/>
+                        <tbody>
+                           <tr>
+                              <td valign="top"  >
+                                 <p class="oj-normal">(a)</p>
+                              </td>
+                              <td valign="top"  >
+                                 <p class="oj-normal">harmonised rules for the placing on the market, the putting into service, and the use of AI systems in the Union;</p>
+                              </td>
+                           </tr>
+                        </tbody>
+                     </table>
+                     <table width="100%" border="0" cellspacing="0" cellpadding="0">
+                        <col width="4%"/>
+                        <col width="96%"/>
+                        <tbody>
+                           <tr>
+                              <td valign="top"  >
+                                 <p class="oj-normal">(b)</p>
+                              </td>
+                              <td valign="top"  >
+                                 <p class="oj-normal">prohibitions of certain AI practices;</p>
+                              </td>
+                           </tr>
+                        </tbody>
+                     </table>
+                     <table width="100%" border="0" cellspacing="0" cellpadding="0">
+                        <col width="4%"/>
+                        <col width="96%"/>
+                        <tbody>
+                           <tr>
+                              <td valign="top"  >
+                                 <p class="oj-normal">(c)</p>
+                              </td>
+                              <td valign="top"  >
+                                 <p class="oj-normal">specific requirements for high-risk AI systems and obligations for operators of such systems;</p>
+                              </td>
+                           </tr>
+                        </tbody>
+                     </table>
+                     <table width="100%" border="0" cellspacing="0" cellpadding="0">
+                        <col width="4%"/>
+                        <col width="96%"/>
+                        <tbody>
+                           <tr>
+                              <td valign="top"  >
+                                 <p class="oj-normal">(d)</p>
+                              </td>
+                              <td valign="top"  >
+                                 <p class="oj-normal">harmonised transparency rules for certain AI systems;</p>
+                              </td>
+                           </tr>
+                        </tbody>
+                     </table>
+                     <table width="100%" border="0" cellspacing="0" cellpadding="0">
+                        <col width="4%"/>
+                        <col width="96%"/>
+                        <tbody>
+                           <tr>
+                              <td valign="top"  >
+                                 <p class="oj-normal">(e)</p>
+                              </td>
+                              <td valign="top"  >
+                                 <p class="oj-normal">harmonised rules for the placing on the market of general-purpose AI models;</p>
+                              </td>
+                           </tr>
+                        </tbody>
+                     </table>
+                     <table width="100%" border="0" cellspacing="0" cellpadding="0">
+                        <col width="4%"/>
+                        <col width="96%"/>
+                        <tbody>
+                           <tr>
+                              <td valign="top"  >
+                                 <p class="oj-normal">(f)</p>
+                              </td>
+                              <td valign="top"  >
+                                 <p class="oj-normal">rules on market monitoring, market surveillance, governance and enforcement;</p>
+                              </td>
+                           </tr>
+                        </tbody>
+                     </table>
+                     <table width="100%" border="0" cellspacing="0" cellpadding="0">
+                        <col width="4%"/>
+                        <col width="96%"/>
+                        <tbody>
+                           <tr>
+                              <td valign="top"  >
+                                 <p class="oj-normal">(g)</p>
+                              </td>
+                              <td valign="top"  >
+                                 <p class="oj-normal">measures to support innovation, with a particular focus on SMEs, including start-ups.</p>
+                              </td>
+                           </tr>
+                        </tbody>
+                     </table>
+                  </div>
+               </div>
+        "#;
+        article_html.to_string()
+    }
+
+    #[test]
+    fn item_parsing_citation() {
+        let html = Html::parse_fragment(&get_article1_html());
+        let selector = Selector::parse("[id^=art_]").unwrap();
+        let element_ref = html.select(&selector).next().unwrap();
+        let article_left = ArticleParser::parse(element_ref).unwrap();
+        let article_right = Article {
+            number: 1,
+            text: html2text(&element_ref.inner_html()),
+        };
+        assert_eq!(article_left, article_right)
+    }
+}
blob - 5a3db55689fda2936bdd6c6713cd56eef59b1d1c
blob + beb3511efeac1aea3dded27c39b515121ce0a6a2
--- eur-lex-scraper-naive/src/parsers/mod.rs
+++ eur-lex-scraper-naive/src/parsers/mod.rs
@@ -1,5 +1,6 @@
 pub mod act_parser;
 pub mod act_title_parser;
+pub mod article;
 pub mod enacting_terms_parser;
 pub mod preamble_item_parser;
 pub mod preamble_parser;