Commit Diff


commit - d30b6c2cbca773270d71c6bc757ff036cbcf1188
commit + 694e0d6dd58ac36b8cf8b91050520056ffaaedc2
blob - 702d54175cb3b9f0805b4bd8801179c3b57757ad
blob + be494ba23f06c54373ab10b3d89190d4e0aee110
--- eur-lex-scrapper/src/models/mod.rs
+++ eur-lex-scrapper/src/models/mod.rs
@@ -1 +1,2 @@
+pub mod articles;
 pub mod recitals;
blob - /dev/null
blob + 23b5be81244ca1cdde328d17faaef13c2173c03e (mode 644)
--- /dev/null
+++ eur-lex-scrapper/src/models/articles.rs
@@ -0,0 +1,289 @@
+use scraper::{Html, Selector};
+use thiserror::Error;
+
+#[derive(Default, Debug)]
+pub enum Rank { // ordering marker stored in `ArticleParagraph::rank`
+    #[default]
+    NoRank, // default: no marker
+    Preamble, // NOTE(review): not used by any parsing code in this file yet — meaning to confirm
+    Number(i32), // presumably a numbered paragraph such as "1." — TODO confirm
+    Letter(char), // presumably a lettered point such as "(a)" — TODO confirm
+}
+
+#[derive(Default, Debug)]
+pub struct ArticleParagraph { // one piece of article text; defaults to `Rank::NoRank` and ""
+    pub rank: Rank, // ordering marker attached to this text
+    pub text: String, // the paragraph's content
+}
+
+#[derive(Debug)]
+pub enum ArticleItem { // one entry of `Article::items`
+    Paragraph(ArticleParagraph), // a single paragraph
+    List(Vec<ArticleParagraph>), // several paragraphs grouped as one item
+}
+
+impl Default for ArticleItem {
+    fn default() -> Self {
+        Self::Paragraph(Default::default()) // an empty paragraph with `Rank::NoRank`
+    }
+}
+
+#[derive(Default, Debug)]
+pub struct Article { // result of `Article::from_str`
+    pub number: i32, // parsed from the `.oj-ti-art` element, e.g. 4 for "Article&nbsp;4"
+    pub title: String, // inner HTML of the `.oj-sti-art` element
+    pub items: Vec<ArticleItem>, // left empty by `from_str` for now
+}
+
+#[derive(Error, Debug)]
+pub enum ArticleParsingError { // failures reported by `Article::from_str`
+    #[error("failed extracting number")]
+    ExtractingNumber, // the article number could not be extracted
+    #[error("failed extracting title")]
+    ExtractingTitle, // the article title could not be extracted
+}
+
+impl Article {
+    /// Parses the title and number of one article from an HTML fragment.
+    ///
+    /// The title is the inner HTML of the first `.oj-sti-art` element. The number is the first
+    /// `.oj-ti-art` element's content with the `Article&nbsp;` prefix removed and whitespace
+    /// trimmed. `items` stays empty: paragraph and list extraction is not implemented yet.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`ArticleParsingError::ExtractingTitle`] when no `.oj-sti-art` element exists,
+    /// and [`ArticleParsingError::ExtractingNumber`] when no `.oj-ti-art` element exists or
+    /// its content does not parse as an `i32`.
+    pub fn from_str(html: &str) -> Result<Self, ArticleParsingError> {
+        let fragment = Html::parse_fragment(html);
+        // Get article title
+        let title_selector = Selector::parse(".oj-sti-art").expect("static selector is valid");
+        let title = fragment
+            .select(&title_selector)
+            .next()
+            .ok_or(ArticleParsingError::ExtractingTitle)?
+            .inner_html();
+
+        // Get article number; a missing element is a number failure, not a title failure.
+        let number_selector = Selector::parse(".oj-ti-art").expect("static selector is valid");
+        let number = fragment
+            .select(&number_selector)
+            .next()
+            .ok_or(ArticleParsingError::ExtractingNumber)?
+            .inner_html()
+            .replace("Article&nbsp;", "")
+            .trim()
+            .parse::<i32>()
+            .map_err(|_| ArticleParsingError::ExtractingNumber)?;
+
+        // TODO: if a <div> wraps the paragraphs, load each one; otherwise handle the single <p>.
+        Ok(Self { number, title, ..Self::default() })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn get_article_html() -> String { // fixture: Article 4, body is a single bare <p>
+        let article_html = r#"
+            <div class="eli-subdivision" id="art_4">
+                <p id="d1e2795-1-1" class="oj-ti-art">Article&nbsp;4</p>
+                <div class="eli-title" id="art_4.tit_1">
+                    <p class="oj-sti-art">AI literacy</p>
+                </div>
+                <p class="oj-normal">Providers and deployers of AI systems shall take measures to ensure, to their best extent, a&nbsp;sufficient level of AI literacy of their staff and other persons dealing with the operation and use of AI systems on their behalf, taking into account their technical knowledge, experience, education and training and the context the AI systems are to be used in, and considering the persons or groups of persons on whom the AI systems are to be used.</p>
+            </div>
+        "#;
+        article_html.to_string()
+    }
+
+    fn get_article2_html() -> String { // fixture: Article 8, two numbered paragraphs, one <div> each
+        let article_html = r#"
+            <div class="eli-subdivision" id="art_8">
+                <p id="d1e3261-1-1" class="oj-ti-art">Article&nbsp;8</p>
+                <div class="eli-title" id="art_8.tit_1">
+                    <p class="oj-sti-art">Compliance with the requirements</p>
+                </div>
+                <div id="008.001">
+                    <p class="oj-normal">1.&nbsp;&nbsp;&nbsp;High-risk AI systems shall comply with the requirements laid down in this Section, taking into account their intended purpose as well as the generally acknowledged state of the art on AI and AI-related technologies. The risk management system referred to in Article&nbsp;9 shall be taken into account when ensuring compliance with those requirements.</p>
+                </div>
+                <div id="008.002">
+                    <p class="oj-normal">2.&nbsp;&nbsp;&nbsp;Where a&nbsp;product contains an AI system, to which the requirements of this Regulation as well as requirements of the Union harmonisation legislation listed in Section A&nbsp;of Annex&nbsp;I&nbsp;apply, providers shall be responsible for ensuring that their product is fully compliant with all applicable requirements under applicable Union harmonisation legislation. In ensuring the compliance of high-risk AI systems referred to in paragraph&nbsp;1 with the requirements set out in this Section, and in order to ensure consistency, avoid duplication and minimise additional burdens, providers shall have a&nbsp;choice of integrating, as appropriate, the necessary testing and reporting processes, information and documentation they provide with regard to their product into documentation and procedures that already exist and are required under the Union harmonisation legislation listed in Section A&nbsp;of Annex&nbsp;I.</p>
+                </div>
+            </div>
+        "#;
+        article_html.to_string()
+    }
+
+    fn get_article3_html() -> String { // fixture: Article 12, numbered paragraphs with lettered points in tables
+        let article_html = r#"
+            <div class="eli-subdivision" id="art_12">
+                <p id="d1e3531-1-1" class="oj-ti-art">Article&nbsp;12</p>
+                <div class="eli-title" id="art_12.tit_1">
+                    <p class="oj-sti-art">Record-keeping</p>
+                </div>
+                <div id="012.001">
+                    <p class="oj-normal">1.&nbsp;&nbsp;&nbsp;High-risk AI systems shall technically allow for the automatic recording of events (logs) over the lifetime of the system.</p>
+                </div>
+                <div id="012.002">
+                    <p class="oj-normal">2.&nbsp;&nbsp;&nbsp;In order to ensure a&nbsp;level of traceability of the functioning of a&nbsp;high-risk AI system that is appropriate to the intended purpose of the system, logging capabilities shall enable the recording of events relevant for:</p>
+                    <table width="100%" border="0" cellspacing="0" cellpadding="0">
+                        <colgroup>
+                            <col width="4%">
+                            <col width="96%">
+                        </colgroup>
+                        <tbody>
+                            <tr>
+                                <td valign="top">
+                                    <p class="oj-normal">(a)</p>
+                                </td>
+                                <td valign="top">
+                                    <p class="oj-normal">identifying situations that may result in the high-risk AI system presenting a&nbsp;risk within the meaning of Article&nbsp;79(1) or in a&nbsp;substantial modification;</p>
+                                </td>
+                            </tr>
+                        </tbody>
+                    </table>
+                    <table width="100%" border="0" cellspacing="0" cellpadding="0">
+                        <colgroup>
+                            <col width="4%">
+                            <col width="96%">
+                        </colgroup>
+                        <tbody>
+                            <tr>
+                                <td valign="top">
+                                    <p class="oj-normal">(b)</p>
+                                </td>
+                                <td valign="top">
+                                    <p class="oj-normal">facilitating the post-market monitoring referred to in Article&nbsp;72; and</p>
+                                </td>
+                            </tr>
+                        </tbody>
+                    </table>
+                    <table width="100%" border="0" cellspacing="0" cellpadding="0">
+                        <colgroup>
+                            <col width="4%">
+                            <col width="96%">
+                        </colgroup>
+                        <tbody>
+                            <tr>
+                                <td valign="top">
+                                    <p class="oj-normal">(c)</p>
+                                </td>
+                                <td valign="top">
+                                    <p class="oj-normal">monitoring the operation of high-risk AI systems referred to in Article&nbsp;26(5).</p>
+                                </td>
+                            </tr>
+                        </tbody>
+                    </table>
+                </div>
+                <div id="012.003">
+                    <p class="oj-normal">3.&nbsp;&nbsp;&nbsp;For high-risk AI systems referred to in point 1 (a), of Annex&nbsp;III, the logging capabilities shall provide, at a&nbsp;minimum:</p>
+                    <table width="100%" border="0" cellspacing="0" cellpadding="0">
+                        <colgroup>
+                            <col width="4%">
+                            <col width="96%">
+                        </colgroup>
+                        <tbody>
+                            <tr>
+                                <td valign="top">
+                                    <p class="oj-normal">(a)</p>
+                                </td>
+                                <td valign="top">
+                                    <p class="oj-normal">recording of the period of each use of the system (start date and time and end date and time of each use);</p>
+                                </td>
+                            </tr>
+                        </tbody>
+                    </table>
+                    <table width="100%" border="0" cellspacing="0" cellpadding="0">
+                        <colgroup>
+                            <col width="4%">
+                            <col width="96%">
+                        </colgroup>
+                        <tbody>
+                            <tr>
+                                <td valign="top">
+                                    <p class="oj-normal">(b)</p>
+                                </td>
+                                <td valign="top">
+                                    <p class="oj-normal">the reference database against which input data has been checked by the system;</p>
+                                </td>
+                            </tr>
+                        </tbody>
+                    </table>
+                    <table width="100%" border="0" cellspacing="0" cellpadding="0">
+                        <colgroup>
+                            <col width="4%">
+                            <col width="96%">
+                        </colgroup>
+                        <tbody>
+                            <tr>
+                                <td valign="top">
+                                    <p class="oj-normal">(c)</p>
+                                </td>
+                                <td valign="top">
+                                    <p class="oj-normal">the input data for which the search has led to a&nbsp;match;</p>
+                                </td>
+                            </tr>
+                        </tbody>
+                    </table>
+                    <table width="100%" border="0" cellspacing="0" cellpadding="0">
+                        <colgroup>
+                            <col width="4%">
+                            <col width="96%">
+                        </colgroup>
+                        <tbody>
+                            <tr>
+                                <td valign="top">
+                                    <p class="oj-normal">(d)</p>
+                                </td>
+                                <td valign="top">
+                                <p class="oj-normal">the identification of the natural persons involved in the verification of the results, as referred to in Article&nbsp;14(5).</p>
+                                </td>
+                            </tr>
+                        </tbody>
+                    </table>
+                </div>
+            </div>
+        "#;
+        article_html.to_string()
+    }
+
+    #[test]
+    fn article_parsing_title() {
+        // Article 4 fixture: the title is read from the `.oj-sti-art` element.
+        let article = Article::from_str(&get_article_html()).unwrap();
+        assert_eq!(article.title, "AI literacy");
+    }
+    #[test]
+    fn article_parsing_number() {
+        // Article 4 fixture: "Article&nbsp;4" in `.oj-ti-art` must yield 4.
+        let article = Article::from_str(&get_article_html()).unwrap();
+        assert_eq!(article.number, 4);
+    }
+    #[test]
+    fn article2_parsing_title() {
+        // Article 8 fixture: the title is read from the `.oj-sti-art` element.
+        let article = Article::from_str(&get_article2_html()).unwrap();
+        assert_eq!(article.title, "Compliance with the requirements");
+    }
+    #[test]
+    fn article2_parsing_number() {
+        // Article 8 fixture: "Article&nbsp;8" in `.oj-ti-art` must yield 8.
+        let article = Article::from_str(&get_article2_html()).unwrap();
+        assert_eq!(article.number, 8);
+    }
+    #[test]
+    fn article3_parsing_title() {
+        // Article 12 fixture: the title is read from the `.oj-sti-art` element.
+        let article = Article::from_str(&get_article3_html()).unwrap();
+        assert_eq!(article.title, "Record-keeping");
+    }
+    #[test]
+    fn article3_parsing_number() {
+        // Article 12 fixture: "Article&nbsp;12" in `.oj-ti-art` must yield 12.
+        let article = Article::from_str(&get_article3_html()).unwrap();
+        assert_eq!(article.number, 12);
+    }
+}