Commit Diff


commit - 321930bbf7f63de99fec843db5f596dc8413f192
commit + d30b6c2cbca773270d71c6bc757ff036cbcf1188
blob - /dev/null
blob + 96ef6c0b944e24fc22f51f18136cd62ffd5b0b8f (mode 644)
--- /dev/null
+++ .gitignore
@@ -0,0 +1,2 @@
+/target
+Cargo.lock
blob - ea8c4bf7f35f6f77f75d92ad8ce8349f6e81ddba (mode 644)
blob + /dev/null
--- eur-lex-scrapper/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-/target
blob - ee95352c6774ebd21c75249b29f4485315f1c32d
blob + 314232c6bfbbd58efebf31d5176890ac7e518b97
--- eur-lex-scrapper/Cargo.toml
+++ eur-lex-scrapper/Cargo.toml
@@ -4,3 +4,6 @@ version = "0.1.0"
 edition = "2024"
 
 [dependencies]
+nanohtml2text = "0.2.1"
+scraper = "0.24.0"
+thiserror = "2.0.17"
blob - b93cf3ffd9cc9c59f584a92d7bd1459d5521ef4e
blob + c446ac88338ad03abd49d91d75980f521a296caf
--- eur-lex-scrapper/src/lib.rs
+++ eur-lex-scrapper/src/lib.rs
@@ -1,14 +1 @@
-pub fn add(left: u64, right: u64) -> u64 {
-    left + right
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn it_works() {
-        let result = add(2, 2);
-        assert_eq!(result, 4);
-    }
-}
+pub mod models;
blob - /dev/null
blob + 702d54175cb3b9f0805b4bd8801179c3b57757ad (mode 644)
--- /dev/null
+++ eur-lex-scrapper/src/models/mod.rs
@@ -0,0 +1 @@
+pub mod recitals;
blob - /dev/null
blob + f2c7cba5aa1d41dd3e991ed749d15af6510966db (mode 644)
--- /dev/null
+++ eur-lex-scrapper/src/models/recitals.rs
@@ -0,0 +1,148 @@
+use nanohtml2text::html2text;
+use scraper::{Html, Selector};
+use thiserror::Error;
+
+#[derive(Error, Debug)] // failure reasons reported by `Recital::from_str`
+pub enum RecitalParsingError {
+    #[error("failed extracting number")] // no `.oj-normal` element to read the number from
+    ExtractingNumber,
+    #[error("failed extracting text")] // no second `.oj-normal` element holding the body
+    ExtractingText,
+}
+
+/// Returns `text` with runs of spaces collapsed and any space before `.` or `,` removed.
+pub fn sanatize(mut text: String) -> String {
+    while text.contains("  ") {
+        text = text.replace("  ", " "); // repeat: one pass leaves "   " as "  "
+    }
+    text.replace(" .", ".").replace(" ,", ",") // after collapsing, so "a  ." becomes "a."
+}
+
+#[derive(Debug, Default)] // `Default` yields number 0 and an empty text
+pub struct Recital {
+    pub number: i32, // number read from the "(N)" marker in the first `.oj-normal` element
+    pub text: String, // plain-text body produced by `html2text` from the second element
+}
+
+impl Recital {
+    /// Parses one recital from the HTML of its EUR-Lex `eli-subdivision` block.
+    ///
+    /// The first `.oj-normal` element must hold the number as `(N)`, the second the body.
+    /// Footnote citation links (`<a>` children of the body) are removed, the remaining
+    /// HTML is cleaned with [`sanatize`] and converted to plain text with `html2text`.
+    ///
+    /// # Errors
+    /// [`RecitalParsingError::ExtractingNumber`] when the number element is missing or is
+    /// not an integer; [`RecitalParsingError::ExtractingText`] when the body is missing.
+    pub fn from_str(html: &str) -> Result<Self, RecitalParsingError> {
+        let fragment = Html::parse_fragment(html);
+        let selector_content =
+            Selector::parse(".oj-normal").expect("`.oj-normal` is a valid CSS selector");
+        let mut cells = fragment.select(&selector_content);
+
+        // Get recital number: strip the parentheses (and stray whitespace) around it.
+        let number_cell = cells.next().ok_or(RecitalParsingError::ExtractingNumber)?;
+        let number = number_cell
+            .inner_html()
+            .trim_matches(|c: char| c == '(' || c == ')' || c.is_whitespace())
+            .parse::<i32>()
+            // A non-numeric marker is an error rather than a silent `number == 0`.
+            .map_err(|_| RecitalParsingError::ExtractingNumber)?;
+
+        // Get recital content.
+        let text_cell = cells.next().ok_or(RecitalParsingError::ExtractingText)?;
+        let mut text_html = text_cell.inner_html();
+
+        // Remove footnote citations: only `<a>` children are dropped, so other inline
+        // markup (e.g. an emphasised `<span>`) keeps its text.
+        for citation in text_cell.child_elements() {
+            if citation.value().name() == "a" {
+                text_html = text_html.replace(&citation.html(), "");
+            }
+        }
+        // Clean up the gaps the removed citations leave, then convert to plain text.
+        let text = html2text(&sanatize(text_html));
+
+        Ok(Recital { number, text })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Fixture: recital 28, whose body contains no footnote citation.
+    fn get_recital_html() -> String {
+        String::from(r#"
+            <div class="eli-subdivision" id="rct_28">
+                <table width="100%" border="0" cellspacing="0" cellpadding="0">
+                    <colgroup>
+                        <col width="4%">
+                        <col width="96%">
+                    </colgroup>
+                    <tbody>
+                        <tr>
+                            <td valign="top">
+                                <p class="oj-normal">(28)</p>
+                            </td>
+                            <td valign="top">
+                                <p class="oj-normal">Aside from the many beneficial uses of AI, it can also be misused and provide novel and powerful tools for manipulative, exploitative and social control practices. Such practices are particularly harmful and abusive and should be prohibited because they contradict Union values of respect for human dignity, freedom, equality, democracy and the rule of law and fundamental rights enshrined in the Charter, including the right to non-discrimination, to data protection and to privacy and the rights of the child.</p>
+                            </td>
+                        </tr>
+                    </tbody>
+                </table>
+            </div>
+            "#)
+    }
+
+    /// Fixture: recital 9, whose body contains three footnote citation links.
+    fn get_recital_html_bis() -> String {
+        String::from(r#"
+        <div class="eli-subdivision" id="rct_9">
+            <table width="100%" border="0" cellspacing="0" cellpadding="0">
+                <colgroup>
+                    <col width="4%">
+                    <col width="96%">
+                </colgroup>
+                <tbody>
+                    <tr>
+                        <td valign="top">
+                            <p class="oj-normal">(9)</p>
+                        </td>
+                        <td valign="top">
+                            <p class="oj-normal">Harmonised rules applicable to the placing on the market, the putting into service and the use of high-risk AI systems should be laid down consistently with Regulation (EC)&nbsp;No&nbsp;765/2008 of the European Parliament and of the Council&nbsp;<a id="ntc7-L_202401689EN.000101-E0007" href="\#ntr7-L_202401689EN.000101-E0007">(<span class="oj-super oj-note-tag">7</span>)</a>, Decision&nbsp;No&nbsp;768/2008/EC of the European Parliament and of the Council&nbsp;<a id="ntc8-L_202401689EN.000101-E0008" href="\#ntr8-L_202401689EN.000101-E0008">(<span class="oj-super oj-note-tag">8</span>)</a> and Regulation&nbsp;(EU) 2019/1020 of the European Parliament and of the Council&nbsp;<a id="ntc9-L_202401689EN.000101-E0009" href="\#ntr9-L_202401689EN.000101-E0009">(<span class="oj-super oj-note-tag">9</span>)</a> (New Legislative Framework).</p>
+                        </td>
+                     </tr>
+                  </tbody>
+               </table>
+            </div>  
+    "#)
+    }
+
+    #[test]
+    fn recital_parsing_text() {
+        // Without citations the body text must come back verbatim.
+        let expected = "Aside from the many beneficial uses of AI, it can also be misused and provide novel and powerful tools for manipulative, exploitative and social control practices. Such practices are particularly harmful and abusive and should be prohibited because they contradict Union values of respect for human dignity, freedom, equality, democracy and the rule of law and fundamental rights enshrined in the Charter, including the right to non-discrimination, to data protection and to privacy and the rights of the child.";
+        let parsed = Recital::from_str(&get_recital_html()).unwrap();
+        assert_eq!(parsed.text, expected);
+    }
+    #[test]
+    fn recital_parsing_number() {
+        // The "(28)" marker must be parsed into the integer 28.
+        let parsed = Recital::from_str(&get_recital_html()).unwrap();
+        assert_eq!(parsed.number, 28);
+    }
+    #[test]
+    fn recital_parsing_text_with_citation() {
+        // Citation links must be stripped; `&nbsp;` entities are expected as U+00A0.
+        let expected = "Harmonised rules applicable to the placing on the market, the putting into service and the use of high-risk AI systems should be laid down consistently with Regulation (EC)\u{a0}No\u{a0}765/2008 of the European Parliament and of the Council, Decision\u{a0}No\u{a0}768/2008/EC of the European Parliament and of the Council and Regulation\u{a0}(EU) 2019/1020 of the European Parliament and of the Council (New Legislative Framework).";
+        let parsed = Recital::from_str(&get_recital_html_bis()).unwrap();
+        assert_eq!(parsed.text, expected);
+    }
+    #[test]
+    fn recital_parsing_number_with_citation() {
+        // Citations in the body must not disturb parsing of the "(9)" marker.
+        let parsed = Recital::from_str(&get_recital_html_bis()).unwrap();
+        assert_eq!(parsed.number, 9);
+    }
+}