Commit Diff


commit - 78f7dcfd62590dd043b3efe7fedd36d806be1f21
commit + e68a91b7cceaaf34566b92ac6846fc8867101905
blob - /dev/null
blob + 5a79d321fb394747adfb74effce9ce35f2a81315 (mode 644)
--- /dev/null
+++ eur-lex-scraper/data/tests/parsers/article/article_test_5.html
@@ -0,0 +1,7 @@
+<div class="eli-subdivision" id="art_4">
+   <p class="oj-ti-art">Article 4</p>
+   <div class="eli-title" id="art_4.tit_1">
+      <p class="oj-sti-art">AI literacy</p>
+   </div>
+   <p class="oj-normal">Providers and deployers of AI systems shall take measures to ensure, to their best extent, a sufficient level of AI literacy of their staff and other persons dealing with the operation and use of AI systems on their behalf, taking into account their technical knowledge, experience, education and training and the context the AI systems are to be used in, and considering the persons or groups of persons on whom the AI systems are to be used.</p>
+</div>
blob - de9cd82ed7dea22d3523df97e9c74e45021c1c24
blob + 08dd85afcd79845f4d650cce271c95c81d0b9fe6
--- eur-lex-scraper/src/models/articles.rs
+++ eur-lex-scraper/src/models/articles.rs
@@ -2,6 +2,7 @@ use crate::models::enacting_terms::Item;
 
 #[derive(Clone, Debug, Default, PartialEq, Eq)]
 pub struct Article {
+    id: String,
     title: String,
     number: u32,
     text: String,
@@ -14,6 +15,12 @@ impl Into<Item> for Article {
 }
 
 impl Article {
+    pub fn set_id(&mut self, id: String) {
+        self.id = id
+    }
+    pub fn get_id(&self) -> &str {
+        &self.id
+    }
     pub fn set_title(&mut self, title: String) {
         self.title = title
     }
blob - f88578d44278fb36d2970b50c7bf419fa2786329
blob + 4d9cc33fcdb4590872766d17235ec0487d46c987
--- eur-lex-scraper/src/parsers/article.rs
+++ eur-lex-scraper/src/parsers/article.rs
@@ -7,6 +7,8 @@ pub struct ArticleParser {}
 
 #[derive(Error, Debug, PartialEq, PartialOrd)]
 pub enum ArticleParserError {
+    #[error("error while parsing article unique id")]
+    UniqueIdError,
     #[error("error while parsing article title")]
     TitleError,
     #[error("error while parsing article number")]
@@ -26,6 +28,14 @@ impl ArticleParser {
             Ok(number) => number,
             Err(_) => return Err(ArticleParserError::ErrorNumber),
         };
+        let article_id_selector = Selector::parse(r#".oj-ti-art"#).unwrap();
+        let id = match element.select(&article_id_selector).next() {
+            Some(id) => match id.attr("id") {
+                Some(id) => id.to_string(),
+                None => return Err(ArticleParserError::UniqueIdError),
+            },
+            None => return Err(ArticleParserError::UniqueIdError),
+        };
         let article_title_selector = Selector::parse(r#".oj-sti-art"#).unwrap();
         let title = match element.select(&article_title_selector).next() {
             Some(title) => title.inner_html(),
@@ -33,6 +43,7 @@ impl ArticleParser {
         };
         let text = html2text(&element.inner_html());
         let mut article = Article::default();
+        article.set_id(id);
         article.set_title(title);
         article.set_number(number);
         article.set_text(text);
@@ -59,10 +70,14 @@ mod tests {
     fn get_test_article_3() -> String {
         fs::read_to_string("data/tests/parsers/article/article_test_3.html").unwrap()
     }
-    // Article with no id attribute
+    // Article with no id attribute for number extraction
     fn get_test_article_4() -> String {
         fs::read_to_string("data/tests/parsers/article/article_test_4.html").unwrap()
     }
+    // Article with no id attribute for unique id extraction
+    fn get_test_article_5() -> String {
+        fs::read_to_string("data/tests/parsers/article/article_test_5.html").unwrap()
+    }
 
     #[test]
     fn parsing_article_1_well_formed() {
@@ -71,6 +86,7 @@ mod tests {
         let element_ref = html.select(&selector).next().unwrap();
         let article_left = ArticleParser::parse(element_ref).unwrap();
         let mut article_right = Article::default();
+        article_right.set_id("d1e2795-1-1".to_string());
         article_right.set_title("AI literacy".to_string());
         article_right.set_number(4);
         article_right.set_text(html2text(&element_ref.inner_html()));
@@ -95,7 +111,7 @@ mod tests {
         assert_eq!(article_left, article_right)
     }
     #[test]
-    fn parsing_article_no_id_return_no_id_attribute_error() {
+    fn parsing_article_no_id_for_number_return_no_id_attribute_for_error() {
         let html = Html::parse_fragment(&get_test_article_4());
         // Change the selector for test pruposes since we cannot use id
         let selector = Selector::parse(".eli-subdivision").unwrap();
@@ -104,4 +120,14 @@ mod tests {
         let article_right = ArticleParserError::NoIdAttribute;
         assert_eq!(article_left, article_right)
     }
+    #[test]
+    fn parsing_article_no_unique_id_return_no_unique_id_for_error() {
+        let html = Html::parse_fragment(&get_test_article_5());
+        // Change the selector for test purposes since we cannot use id
+        let selector = Selector::parse(".eli-subdivision").unwrap();
+        let element_ref = html.select(&selector).next().unwrap();
+        let article_left = ArticleParser::parse(element_ref).unwrap_err();
+        let article_right = ArticleParserError::UniqueIdError;
+        assert_eq!(article_left, article_right)
+    }
 }