commit f3bb73ef08e4bd67cf31d05f22684185c2602d24 from: Romain VINCENT date: Fri Jan 16 07:30:25 2026 UTC More consistent parser module naming. commit - d0d0a0fa30a225f99d9367e5bf1187f047186535 commit + f3bb73ef08e4bd67cf31d05f22684185c2602d24 blob - 7d46b94eb55a4fbda29dfe7c36b0ba9c9d3aa1ca (mode 644) blob + /dev/null --- eur-lex-scraper/src/parsers/act_parser.rs +++ /dev/null @@ -1,71 +0,0 @@ -use crate::models::acts::EUAct; -use crate::parsers::act_title_parser::{EUActTileParser, EUActTitleParserError}; -use crate::parsers::preamble_parser::PreambleParser; -use scraper::{Html, Selector}; -use thiserror::Error; - -#[derive(Error, Debug)] -pub enum EUActParserError { - #[error("error while parsing title: {0}")] - TitleError(EUActTitleParserError), -} - -impl From for EUActParserError { - fn from(value: EUActTitleParserError) -> Self { - EUActParserError::TitleError(value) - } -} - -pub struct EUActParser {} - -impl EUActParser { - pub fn parse(html: &str) -> Result { - let act_html = Html::parse_document(html); - - // ////////////// - // Get act title - let title_selector = Selector::parse(".eli-main-title").unwrap(); - let title_element = act_html.select(&title_selector).next().unwrap(); - let title = EUActTileParser::parse(title_element)?; - - // ///////////// - // Get preamble - let preamble_selector = Selector::parse("#pbl_1").unwrap(); - let preamble_section = act_html.select(&preamble_selector).next().unwrap(); - let preamble = PreambleParser::parse(preamble_section).unwrap(); - - Ok(EUAct { title, preamble }) - } -} - -#[cfg(test)] -mod tests { - use std::fs; - - use super::*; - - fn get_act_html_simple() -> String { - fs::read_to_string("data/test_act_simple.html").unwrap() - } - - fn get_act_html() -> String { - fs::read_to_string("data/EU_2024_01689.html").unwrap() - } - - fn get_act_title() -> String { - "REGULATION (EU) 2024/1689 OF THE EUROPEAN PARLIAMENT AND OF THE COUNCIL of 13\u{a0}June 2024 laying down harmonised rules on artificial intelligence and amending Regulations (EC) No\u{a0}300/2008, (EU) No\u{a0}167/2013, (EU) No\u{a0}168/2013, (EU) 2018/858, (EU) 2018/1139 and (EU) 2019/2144 and Directives 2014/90/EU, (EU) 2016/797 and (EU) 2020/1828 (Artificial Intelligence Act)(Text with EEA relevance)".to_string() - } - - #[test] - fn act_parsing_title_html_simple() { - let title = get_act_title(); - let act = EUActParser::parse(&get_act_html_simple()).unwrap(); - assert_eq!(act.title, title); - } - #[test] - fn act_parsing_title_html() { - let title = get_act_title(); - let act = EUActParser::parse(&get_act_html()).unwrap(); - assert_eq!(act.title, title); - } -} blob - /dev/null blob + 61f872aec5803980ed69405184f3191d255ff1cc (mode 644) --- /dev/null +++ eur-lex-scraper/src/parsers/act.rs @@ -0,0 +1,74 @@ +use crate::models::acts::EUAct; +use crate::parsers::{ + act_title::{EUActTileParser, EUActTitleParserError}, + preamble::PreambleParser, +}; + +use scraper::{Html, Selector}; +use thiserror::Error; + +#[derive(Error, Debug)] +pub enum EUActParserError { + #[error("error while parsing title: {0}")] + TitleError(EUActTitleParserError), +} + +impl From for EUActParserError { + fn from(value: EUActTitleParserError) -> Self { + EUActParserError::TitleError(value) + } +} + +pub struct EUActParser {} + +impl EUActParser { + pub fn parse(html: &str) -> Result { + let act_html = Html::parse_document(html); + + // ////////////// + // Get act title + let title_selector = Selector::parse(".eli-main-title").unwrap(); + let title_element = act_html.select(&title_selector).next().unwrap(); + let title = EUActTileParser::parse(title_element)?; + + // ///////////// + // Get preamble + let preamble_selector = Selector::parse("#pbl_1").unwrap(); + let preamble_section = act_html.select(&preamble_selector).next().unwrap(); + let preamble = PreambleParser::parse(preamble_section).unwrap(); + + Ok(EUAct { title, preamble }) + } +} + +#[cfg(test)] +mod tests { + use std::fs; + + use super::*; + + fn get_act_html_simple() -> String { + fs::read_to_string("data/test_act_simple.html").unwrap() + } + + fn get_act_html() -> String { + fs::read_to_string("data/EU_2024_01689.html").unwrap() + } + + fn get_act_title() -> String { + "REGULATION (EU) 2024/1689 OF THE EUROPEAN PARLIAMENT AND OF THE COUNCIL of 13\u{a0}June 2024 laying down harmonised rules on artificial intelligence and amending Regulations (EC) No\u{a0}300/2008, (EU) No\u{a0}167/2013, (EU) No\u{a0}168/2013, (EU) 2018/858, (EU) 2018/1139 and (EU) 2019/2144 and Directives 2014/90/EU, (EU) 2016/797 and (EU) 2020/1828 (Artificial Intelligence Act)(Text with EEA relevance)".to_string() + } + + #[test] + fn act_parsing_title_html_simple() { + let title = get_act_title(); + let act = EUActParser::parse(&get_act_html_simple()).unwrap(); + assert_eq!(act.title, title); + } + #[test] + fn act_parsing_title_html() { + let title = get_act_title(); + let act = EUActParser::parse(&get_act_html()).unwrap(); + assert_eq!(act.title, title); + } +} blob - 0bff7ef487c062276c514afa7897c63cf440e898 (mode 644) blob + /dev/null --- eur-lex-scraper/src/parsers/act_title_parser.rs +++ /dev/null @@ -1,95 +0,0 @@ -use crate::models::{acts::EUAct, preambles::Preamble}; -use nanohtml2text::html2text; -use scraper::{ElementRef, Html, Selector}; -use thiserror::Error; - -#[derive(Error, Debug)] -pub enum EUActTitleParserError { - #[error("title not found")] - NoTitleFound, - #[error("empty title")] - EmptyTitle, -} - -pub(crate) struct EUActTileParser {} - -impl EUActTileParser { - pub(crate) fn parse(element: ElementRef) -> Result { - let title_parts_select = Selector::parse(".oj-doc-ti").unwrap(); - let elements = element.select(&title_parts_select); - let mut title_vec: Vec = Vec::new(); - for el in elements { - title_vec.push(html2text(&el.html())); - } - if title_vec.is_empty() { - return Err(EUActTitleParserError::NoTitleFound); - } - let title = title_vec.join(""); - if title.is_empty() { - return Err(EUActTitleParserError::EmptyTitle); - } - Ok(title.trim().to_string()) - } -} - -#[derive(Error, Debug)] -pub enum EUActParserError { - #[error("error while parsing title: {0}")] - TitleError(EUActTitleParserError), -} - -impl From for EUActParserError { - fn from(value: EUActTitleParserError) -> Self { - EUActParserError::TitleError(value) - } -} - -pub struct EUActParser {} - -impl EUActParser { - pub fn parse(html: &str) -> Result { - let act_html = Html::parse_document(html); - - // ////////////// - // Get act title - let title_selector = Selector::parse(".eli-main-title").unwrap(); - let title_element = act_html.select(&title_selector).next().unwrap(); - let title = EUActTileParser::parse(title_element)?; - Ok(EUAct { - title, - preamble: Preamble::default(), - }) - } -} - -#[cfg(test)] -mod tests { - use std::fs; - - use super::*; - - fn get_act_html_simple() -> String { - fs::read_to_string("data/test_act_simple.html").unwrap() - } - - fn get_act_html() -> String { - fs::read_to_string("data/EU_2024_01689.html").unwrap() - } - - fn get_act_title() -> String { - "REGULATION (EU) 2024/1689 OF THE EUROPEAN PARLIAMENT AND OF THE COUNCIL of 13\u{a0}June 2024 laying down harmonised rules on artificial intelligence and amending Regulations (EC) No\u{a0}300/2008, (EU) No\u{a0}167/2013, (EU) No\u{a0}168/2013, (EU) 2018/858, (EU) 2018/1139 and (EU) 2019/2144 and Directives 2014/90/EU, (EU) 2016/797 and (EU) 2020/1828 (Artificial Intelligence Act)(Text with EEA relevance)".to_string() - } - - #[test] - fn act_parsing_title_html_simple() { - let title = get_act_title(); - let act = EUActParser::parse(&get_act_html_simple()).unwrap(); - assert_eq!(act.title, title); - } - #[test] - fn act_parsing_title_html() { - let title = get_act_title(); - let act = EUActParser::parse(&get_act_html()).unwrap(); - assert_eq!(act.title, title); - } -} blob - /dev/null blob + 0bff7ef487c062276c514afa7897c63cf440e898 (mode 644) --- /dev/null +++ eur-lex-scraper/src/parsers/act_title.rs @@ -0,0 +1,95 @@ +use crate::models::{acts::EUAct, preambles::Preamble}; +use nanohtml2text::html2text; +use scraper::{ElementRef, Html, Selector}; +use thiserror::Error; + +#[derive(Error, Debug)] +pub enum EUActTitleParserError { + #[error("title not found")] + NoTitleFound, + #[error("empty title")] + EmptyTitle, +} + +pub(crate) struct EUActTileParser {} + +impl EUActTileParser { + pub(crate) fn parse(element: ElementRef) -> Result { + let title_parts_select = Selector::parse(".oj-doc-ti").unwrap(); + let elements = element.select(&title_parts_select); + let mut title_vec: Vec = Vec::new(); + for el in elements { + title_vec.push(html2text(&el.html())); + } + if title_vec.is_empty() { + return Err(EUActTitleParserError::NoTitleFound); + } + let title = title_vec.join(""); + if title.is_empty() { + return Err(EUActTitleParserError::EmptyTitle); + } + Ok(title.trim().to_string()) + } +} + +#[derive(Error, Debug)] +pub enum EUActParserError { + #[error("error while parsing title: {0}")] + TitleError(EUActTitleParserError), +} + +impl From for EUActParserError { + fn from(value: EUActTitleParserError) -> Self { + EUActParserError::TitleError(value) + } +} + +pub struct EUActParser {} + +impl EUActParser { + pub fn parse(html: &str) -> Result { + let act_html = Html::parse_document(html); + + // ////////////// + // Get act title + let title_selector = Selector::parse(".eli-main-title").unwrap(); + let title_element = act_html.select(&title_selector).next().unwrap(); + let title = EUActTileParser::parse(title_element)?; + Ok(EUAct { + title, + preamble: Preamble::default(), + }) + } +} + +#[cfg(test)] +mod tests { + use std::fs; + + use super::*; + + fn get_act_html_simple() -> String { + fs::read_to_string("data/test_act_simple.html").unwrap() + } + + fn get_act_html() -> String { + fs::read_to_string("data/EU_2024_01689.html").unwrap() + } + + fn get_act_title() -> String { + "REGULATION (EU) 2024/1689 OF THE EUROPEAN PARLIAMENT AND OF THE COUNCIL of 13\u{a0}June 2024 laying down harmonised rules on artificial intelligence and amending Regulations (EC) No\u{a0}300/2008, (EU) No\u{a0}167/2013, (EU) No\u{a0}168/2013, (EU) 2018/858, (EU) 2018/1139 and (EU) 2019/2144 and Directives 2014/90/EU, (EU) 2016/797 and (EU) 2020/1828 (Artificial Intelligence Act)(Text with EEA relevance)".to_string() + } + + #[test] + fn act_parsing_title_html_simple() { + let title = get_act_title(); + let act = EUActParser::parse(&get_act_html_simple()).unwrap(); + assert_eq!(act.title, title); + } + #[test] + fn act_parsing_title_html() { + let title = get_act_title(); + let act = EUActParser::parse(&get_act_html()).unwrap(); + assert_eq!(act.title, title); + } +} blob - d015548f1560d9974dd899631dbba2f779260023 (mode 644) blob + /dev/null --- eur-lex-scraper/src/parsers/enacting_terms_parser.rs +++ /dev/null @@ -1,53 +0,0 @@ -use scraper::{ElementRef, Selector}; -use thiserror::Error; - -use crate::models::enacting_terms::EnactingTerms; -use crate::parsers::article::{ArticleParser, ArticleParserError}; -use crate::parsers::chapter::{ChapterParser, ChapterParserError}; - -pub struct EnactingTermParser {} - -#[derive(Error, Debug, PartialEq, PartialOrd)] -pub enum EnactingTermParserError { - #[error("error while parsing enacting term")] - GenericError, - #[error("error while parsing chapter")] - ChapterError(ChapterParserError), - #[error("error while parsing aricles")] - ArticleError(ArticleParserError), -} - -impl From for EnactingTermParserError { - fn from(value: ChapterParserError) -> Self { - EnactingTermParserError::ChapterError(value) - } -} - -impl From for EnactingTermParserError { - fn from(value: ArticleParserError) -> Self { - EnactingTermParserError::ArticleError(value) - } -} - -impl EnactingTermParser { - pub fn parse(element: ElementRef) -> Result { - let mut enacting_terms = EnactingTerms::default(); - let chapter_selector = Selector::parse("[id^=cpt_]").unwrap(); - let chapter_count = element.select(&chapter_selector).count(); - if chapter_count > 0 { - for chapter in element.select(&chapter_selector) { - let chapter = ChapterParser::parse(chapter)?; - enacting_terms.push(chapter); - } - } else { - // See following document for document with articles only - // https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:32006D0443 - let article_selector = Selector::parse(r#"[id^="art_"]:not([id*=".tit"])"#).unwrap(); - for article in element.select(&article_selector) { - let article = ArticleParser::parse(article)?; - enacting_terms.push(article); - } - } - Ok(enacting_terms) - } -} blob - /dev/null blob + d015548f1560d9974dd899631dbba2f779260023 (mode 644) --- /dev/null +++ eur-lex-scraper/src/parsers/enacting_terms.rs @@ -0,0 +1,53 @@ +use scraper::{ElementRef, Selector}; +use thiserror::Error; + +use crate::models::enacting_terms::EnactingTerms; +use crate::parsers::article::{ArticleParser, ArticleParserError}; +use crate::parsers::chapter::{ChapterParser, ChapterParserError}; + +pub struct EnactingTermParser {} + +#[derive(Error, Debug, PartialEq, PartialOrd)] +pub enum EnactingTermParserError { + #[error("error while parsing enacting term")] + GenericError, + #[error("error while parsing chapter")] + ChapterError(ChapterParserError), + #[error("error while parsing aricles")] + ArticleError(ArticleParserError), +} + +impl From for EnactingTermParserError { + fn from(value: ChapterParserError) -> Self { + EnactingTermParserError::ChapterError(value) + } +} + +impl From for EnactingTermParserError { + fn from(value: ArticleParserError) -> Self { + EnactingTermParserError::ArticleError(value) + } +} + +impl EnactingTermParser { + pub fn parse(element: ElementRef) -> Result { + let mut enacting_terms = EnactingTerms::default(); + let chapter_selector = Selector::parse("[id^=cpt_]").unwrap(); + let chapter_count = element.select(&chapter_selector).count(); + if chapter_count > 0 { + for chapter in element.select(&chapter_selector) { + let chapter = ChapterParser::parse(chapter)?; + enacting_terms.push(chapter); + } + } else { + // See following document for document with articles only + // https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:32006D0443 + let article_selector = Selector::parse(r#"[id^="art_"]:not([id*=".tit"])"#).unwrap(); + for article in element.select(&article_selector) { + let article = ArticleParser::parse(article)?; + enacting_terms.push(article); + } + } + Ok(enacting_terms) + } +} blob - 4d1edacb2dad408c93395f9ed037c4148b25711c blob + 9fd6ae5e4c7d7cfb50def5eaec2d5c4224486a24 --- eur-lex-scraper/src/parsers/mod.rs +++ eur-lex-scraper/src/parsers/mod.rs @@ -1,8 +1,8 @@ -pub mod act_parser; -pub mod act_title_parser; +pub mod act; +pub mod act_title; pub mod article; pub mod chapter; -pub mod enacting_terms_parser; -pub mod preamble_item_parser; -pub mod preamble_parser; +pub mod enacting_terms; +pub mod preamble; +pub mod preamble_item; pub mod section; blob - 935da0d6da1e8942530d31d5f46dfc6d1d995c3d (mode 644) blob + /dev/null --- eur-lex-scraper/src/parsers/preamble_item_parser.rs +++ /dev/null @@ -1,172 +0,0 @@ -use crate::models::preambles::{ItemType, PreambleItem}; -use nanohtml2text::html2text; -use scraper::{ElementRef, Selector}; -use thiserror::Error; - -#[derive(Error, Debug, PartialEq, PartialOrd)] -pub enum PreambleItemParserError { - #[error("error while parsing content")] - NoContent, - #[error("error while parsing number")] - ErrorNumber, - #[error("item type error")] - ItemTypeError, -} - -pub struct PreambleItemParser {} - -impl PreambleItemParser { - pub fn parse(element: ElementRef) -> Result { - let id = match element.attr("id") { - Some(id) => id.to_string(), - None => return Err(PreambleItemParserError::ErrorNumber), - }; - let mut item_type = id.clone(); - item_type.truncate(3); - let item_type = match item_type.as_str() { - "rct" => ItemType::Recital, - "cit" => ItemType::Citation, - _ => return Err(PreambleItemParserError::ItemTypeError), - }; - let number_str = match item_type { - ItemType::Citation => id.replace("cit_", ""), - ItemType::Recital => id.replace("rct_", ""), - }; - let number: u32 = match number_str.parse() { - Ok(number) => number, - Err(_) => return Err(PreambleItemParserError::ErrorNumber), - }; - let selector = Selector::parse(".oj-normal").unwrap(); - let mut content: Vec = Vec::new(); - for element in element.select(&selector) { - content.push(html2text(&element.html())) - } - let content = content.join(" "); - if content.is_empty() { - return Err(PreambleItemParserError::NoContent); - } - Ok(PreambleItem { - number, - item_type, - content, - }) - } -} -#[cfg(test)] -mod tests { - - use super::*; - use scraper::{Html, Selector}; - - fn get_citation_html() -> String { - let article_html = r#" -
-

THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE EUROPEAN UNION,

-
-

Having regard to the Treaty on the Functioning of the European Union, and in particular Articles 16 and 114 thereof,

-
-
- "#; - article_html.to_string() - } - - fn get_recital_html() -> String { - let article_html = r#" -
-

THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE EUROPEAN UNION,

-
-

Having regard to the Treaty on the Functioning of the European Union, and in particular Articles 16 and 114 thereof,

-
-

Whereas:

-
- - - - - - - - - -
-

(1)

-
-

The purpose of this Regulation is to improve the functioning of the internal market by laying down a uniform legal framework in particular for the development, the placing on the market, the putting into service and the use of artificial intelligence systems (AI systems) in the Union, in accordance with Union values, to promote the uptake of human centric and trustworthy artificial intelligence (AI) while ensuring a high level of protection of health, safety, fundamental rights as enshrined in the Charter of Fundamental Rights of the European Union (the ‘Charter’), including democracy, the rule of law and environmental protection, to protect against the harmful effects of AI systems in the Union, and to support innovation. This Regulation ensures the free movement, cross-border, of AI-based goods and services, thus preventing Member States from imposing restrictions on the development, marketing and use of AI systems, unless explicitly authorised by this Regulation.

-
-
-
- "#; - article_html.to_string() - } - fn get_citation_html_no_number() -> String { - let article_html = r#" -
-

THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE EUROPEAN UNION,

-
-

Having regard to the Treaty on the Functioning of the European Union, and in particular Articles 16 and 114 thereof,

-
-
- "#; - article_html.to_string() - } - - fn get_citation_html_no_content() -> String { - let article_html = r#" -
-

THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE EUROPEAN UNION,

-
-
-
- "#; - article_html.to_string() - } - - #[test] - fn item_parsing_citation() { - let html = Html::parse_fragment(&get_citation_html()); - let selector = Selector::parse("[id^=cit_]").unwrap(); - let element_ref = html.select(&selector).next().unwrap(); - let citation_left = PreambleItemParser::parse(element_ref).unwrap(); - let citation_right = PreambleItem { - number: 1, - item_type: ItemType::Citation, - content: String::from( - "Having regard to the Treaty on the Functioning of the European Union, and in particular Articles 16 and 114 thereof,", - ), - }; - assert_eq!(citation_left, citation_right) - } - #[test] - fn item_parsing_recital() { - let html = Html::parse_fragment(&get_recital_html()); - let selector = Selector::parse("[id^=rct_]").unwrap(); - let element_ref = html.select(&selector).next().unwrap(); - let citation_left = PreambleItemParser::parse(element_ref).unwrap(); - let citation_right = PreambleItem { - number: 1, - item_type: ItemType::Recital, - content: String::from( - "(1) The purpose of this Regulation is to improve the functioning of the internal market by laying down a uniform legal framework in particular for the development, the placing on the market, the putting into service and the use of artificial intelligence systems (AI systems) in the Union, in accordance with Union values, to promote the uptake of human centric and trustworthy artificial intelligence (AI) while ensuring a high level of protection of health, safety, fundamental rights as enshrined in the Charter of Fundamental Rights of the European Union (the ‘Charter’), including democracy, the rule of law and environmental protection, to protect against the harmful effects of AI systems in the Union, and to support innovation. This Regulation ensures the free movement, cross-border, of AI-based goods and services, thus preventing Member States from imposing restrictions on the development, marketing and use of AI systems, unless explicitly authorised by this Regulation.", - ), - }; - assert_eq!(citation_left, citation_right) - } - #[test] - fn item_parsing_citation_no_number() { - let html = Html::parse_fragment(&get_citation_html_no_number()); - let selector = Selector::parse("[id^=cit_]").unwrap(); - let element_ref = html.select(&selector).next().unwrap(); - let citation_left = PreambleItemParser::parse(element_ref).unwrap_err(); - let citation_right = PreambleItemParserError::ErrorNumber; - assert_eq!(citation_left, citation_right); - } - #[test] - fn item_parsing_citation_no_content() { - let html = Html::parse_fragment(&get_citation_html_no_content()); - let selector = Selector::parse("[id^=cit_]").unwrap(); - let element_ref = html.select(&selector).next().unwrap(); - let citation_left = PreambleItemParser::parse(element_ref).unwrap_err(); - let citation_right = PreambleItemParserError::NoContent; - assert_eq!(citation_left, citation_right); - } -} blob - /dev/null blob + 3c30e7af02c402f0c5ce8a14ab40d290d67a913c (mode 644) --- /dev/null +++ eur-lex-scraper/src/parsers/preamble.rs @@ -0,0 +1,40 @@ +use scraper::{ElementRef, Selector}; +use thiserror::Error; + +use crate::{ + models::preambles::Preamble, + parsers::preamble_item::{PreambleItemParser, PreambleItemParserError}, +}; + +#[derive(Error, Debug, PartialEq, PartialOrd)] +pub enum PreambleParserError { + #[error("error while parsing preamble item: {0}")] + ItemParserError(PreambleItemParserError), +} + +impl From for PreambleParserError { + fn from(value: PreambleItemParserError) -> Self { + PreambleParserError::ItemParserError(value) + } +} + +pub struct PreambleParser {} + +impl PreambleParser { + pub fn parse(element: ElementRef) -> Result { + let mut preamble = Preamble::default(); + // Citations + let citation_selector = Selector::parse("[id^=cit_]").unwrap(); + for citation in element.select(&citation_selector) { + let citation = PreambleItemParser::parse(citation)?; + preamble.add_item(citation); + } + // Recitals + let recital_selector = Selector::parse("[id^=rct_]").unwrap(); + for recital in element.select(&recital_selector) { + let recital = PreambleItemParser::parse(recital)?; + preamble.add_item(recital); + } + Ok(preamble) + } +} blob - 8e53c9a873c435684949f3c24007bf0ca4ffc0d9 (mode 644) blob + /dev/null --- eur-lex-scraper/src/parsers/preamble_parser.rs +++ /dev/null @@ -1,40 +0,0 @@ -use scraper::{ElementRef, Selector}; -use thiserror::Error; - -use crate::{ - models::preambles::Preamble, - parsers::preamble_item_parser::{PreambleItemParser, PreambleItemParserError}, -}; - -#[derive(Error, Debug, PartialEq, PartialOrd)] -pub enum PreambleParserError { - #[error("error while parsing preamble item: {0}")] - ItemParserError(PreambleItemParserError), -} - -impl From for PreambleParserError { - fn from(value: PreambleItemParserError) -> Self { - PreambleParserError::ItemParserError(value) - } -} - -pub struct PreambleParser {} - -impl PreambleParser { - pub fn parse(element: ElementRef) -> Result { - let mut preamble = Preamble::default(); - // Citations - let citation_selector = Selector::parse("[id^=cit_]").unwrap(); - for citation in element.select(&citation_selector) { - let citation = PreambleItemParser::parse(citation)?; - preamble.add_item(citation); - } - // Recitals - let recital_selector = Selector::parse("[id^=rct_]").unwrap(); - for recital in element.select(&recital_selector) { - let recital = PreambleItemParser::parse(recital)?; - preamble.add_item(recital); - } - Ok(preamble) - } -} blob - /dev/null blob + 935da0d6da1e8942530d31d5f46dfc6d1d995c3d (mode 644) --- /dev/null +++ eur-lex-scraper/src/parsers/preamble_item.rs @@ -0,0 +1,172 @@ +use crate::models::preambles::{ItemType, PreambleItem}; +use nanohtml2text::html2text; +use scraper::{ElementRef, Selector}; +use thiserror::Error; + +#[derive(Error, Debug, PartialEq, PartialOrd)] +pub enum PreambleItemParserError { + #[error("error while parsing content")] + NoContent, + #[error("error while parsing number")] + ErrorNumber, + #[error("item type error")] + ItemTypeError, +} + +pub struct PreambleItemParser {} + +impl PreambleItemParser { + pub fn parse(element: ElementRef) -> Result { + let id = match element.attr("id") { + Some(id) => id.to_string(), + None => return Err(PreambleItemParserError::ErrorNumber), + }; + let mut item_type = id.clone(); + item_type.truncate(3); + let item_type = match item_type.as_str() { + "rct" => ItemType::Recital, + "cit" => ItemType::Citation, + _ => return Err(PreambleItemParserError::ItemTypeError), + }; + let number_str = match item_type { + ItemType::Citation => id.replace("cit_", ""), + ItemType::Recital => id.replace("rct_", ""), + }; + let number: u32 = match number_str.parse() { + Ok(number) => number, + Err(_) => return Err(PreambleItemParserError::ErrorNumber), + }; + let selector = Selector::parse(".oj-normal").unwrap(); + let mut content: Vec = Vec::new(); + for element in element.select(&selector) { + content.push(html2text(&element.html())) + } + let content = content.join(" "); + if content.is_empty() { + return Err(PreambleItemParserError::NoContent); + } + Ok(PreambleItem { + number, + item_type, + content, + }) + } +} +#[cfg(test)] +mod tests { + + use super::*; + use scraper::{Html, Selector}; + + fn get_citation_html() -> String { + let article_html = r#" +
+

THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE EUROPEAN UNION,

+
+

Having regard to the Treaty on the Functioning of the European Union, and in particular Articles 16 and 114 thereof,

+
+
+ "#; + article_html.to_string() + } + + fn get_recital_html() -> String { + let article_html = r#" +
+

THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE EUROPEAN UNION,

+
+

Having regard to the Treaty on the Functioning of the European Union, and in particular Articles 16 and 114 thereof,

+
+

Whereas:

+
+ + + + + + + + + +
+

(1)

+
+

The purpose of this Regulation is to improve the functioning of the internal market by laying down a uniform legal framework in particular for the development, the placing on the market, the putting into service and the use of artificial intelligence systems (AI systems) in the Union, in accordance with Union values, to promote the uptake of human centric and trustworthy artificial intelligence (AI) while ensuring a high level of protection of health, safety, fundamental rights as enshrined in the Charter of Fundamental Rights of the European Union (the ‘Charter’), including democracy, the rule of law and environmental protection, to protect against the harmful effects of AI systems in the Union, and to support innovation. This Regulation ensures the free movement, cross-border, of AI-based goods and services, thus preventing Member States from imposing restrictions on the development, marketing and use of AI systems, unless explicitly authorised by this Regulation.

+
+
+
+ "#; + article_html.to_string() + } + fn get_citation_html_no_number() -> String { + let article_html = r#" +
+

THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE EUROPEAN UNION,

+
+

Having regard to the Treaty on the Functioning of the European Union, and in particular Articles 16 and 114 thereof,

+
+
+ "#; + article_html.to_string() + } + + fn get_citation_html_no_content() -> String { + let article_html = r#" +
+

THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE EUROPEAN UNION,

+
+
+
+ "#; + article_html.to_string() + } + + #[test] + fn item_parsing_citation() { + let html = Html::parse_fragment(&get_citation_html()); + let selector = Selector::parse("[id^=cit_]").unwrap(); + let element_ref = html.select(&selector).next().unwrap(); + let citation_left = PreambleItemParser::parse(element_ref).unwrap(); + let citation_right = PreambleItem { + number: 1, + item_type: ItemType::Citation, + content: String::from( + "Having regard to the Treaty on the Functioning of the European Union, and in particular Articles 16 and 114 thereof,", + ), + }; + assert_eq!(citation_left, citation_right) + } + #[test] + fn item_parsing_recital() { + let html = Html::parse_fragment(&get_recital_html()); + let selector = Selector::parse("[id^=rct_]").unwrap(); + let element_ref = html.select(&selector).next().unwrap(); + let citation_left = PreambleItemParser::parse(element_ref).unwrap(); + let citation_right = PreambleItem { + number: 1, + item_type: ItemType::Recital, + content: String::from( + "(1) The purpose of this Regulation is to improve the functioning of the internal market by laying down a uniform legal framework in particular for the development, the placing on the market, the putting into service and the use of artificial intelligence systems (AI systems) in the Union, in accordance with Union values, to promote the uptake of human centric and trustworthy artificial intelligence (AI) while ensuring a high level of protection of health, safety, fundamental rights as enshrined in the Charter of Fundamental Rights of the European Union (the ‘Charter’), including democracy, the rule of law and environmental protection, to protect against the harmful effects of AI systems in the Union, and to support innovation. This Regulation ensures the free movement, cross-border, of AI-based goods and services, thus preventing Member States from imposing restrictions on the development, marketing and use of AI systems, unless explicitly authorised by this Regulation.", + ), + }; + assert_eq!(citation_left, citation_right) + } + #[test] + fn item_parsing_citation_no_number() { + let html = Html::parse_fragment(&get_citation_html_no_number()); + let selector = Selector::parse("[id^=cit_]").unwrap(); + let element_ref = html.select(&selector).next().unwrap(); + let citation_left = PreambleItemParser::parse(element_ref).unwrap_err(); + let citation_right = PreambleItemParserError::ErrorNumber; + assert_eq!(citation_left, citation_right); + } + #[test] + fn item_parsing_citation_no_content() { + let html = Html::parse_fragment(&get_citation_html_no_content()); + let selector = Selector::parse("[id^=cit_]").unwrap(); + let element_ref = html.select(&selector).next().unwrap(); + let citation_left = PreambleItemParser::parse(element_ref).unwrap_err(); + let citation_right = PreambleItemParserError::NoContent; + assert_eq!(citation_left, citation_right); + } +}