commit 78f7dcfd62590dd043b3efe7fedd36d806be1f21 from: Romain VINCENT date: Sat Jan 17 10:11:19 2026 UTC Add title to Section module. commit - 358471e349765354d6c4aa8172b94e27c2b2515c commit + 78f7dcfd62590dd043b3efe7fedd36d806be1f21 blob - 59939a3bd4f42b8d2691492a71e9f1e56e6fa925 blob + c9fb979d512b95d86f965fe461e65d25d888b000 --- eur-lex-scraper/src/models/section.rs +++ eur-lex-scraper/src/models/section.rs @@ -2,6 +2,7 @@ use crate::models::{articles::Article, enacting_terms: #[derive(Clone, Debug, Default, PartialEq, Eq)] pub struct Section { + title: String, items: Vec<Article>
, } @@ -21,6 +22,12 @@ impl IntoIterator for Section { } impl Section { + pub fn set_title(&mut self, title: String) { + self.title = title; + } + pub fn get_title(&self) -> &str { + &self.title + } pub fn push(&mut self, article: Article) { self.items.push(article) } blob - 161624a33298dea9b4ce298f8fa8d7975108ed54 blob + 0eaf1d07e47fd85d37c69b9cf1af59c94fd8b783 --- eur-lex-scraper/src/parsers/section.rs +++ eur-lex-scraper/src/parsers/section.rs @@ -10,6 +10,8 @@ pub struct SectionParser {} pub enum SectionParserError { #[error("error while parsing section")] GenericError, + #[error("error while parsing the title")] + TitleError, #[error("error parsing article")] ArticleError(ArticleParserError), } @@ -23,6 +25,13 @@ impl From<ArticleParserError> for SectionParserError { impl SectionParser { pub fn parse(element: ElementRef) -> Result<Section, SectionParserError> { let mut section = Section::default(); + // This class should appear only once per section + let section_title_selector = Selector::parse(r#".oj-ti-section-2"#).unwrap(); + let title = match element.select(&section_title_selector).next() { + Some(title) => title.inner_html(), + None => return Err(SectionParserError::TitleError), + }; + section.set_title(nanohtml2text::html2text(&title).trim().to_string()); // select article but not titles let article_selector = Selector::parse(r#"[id^="art_"]:not([id*=".tit"])"#).unwrap(); for article in element.select(&article_selector) { @@ -44,11 +53,15 @@ mod tests { } #[test] - fn parsing_article() { + fn parsing_section_1() { let html = Html::parse_fragment(&get_test_section_1()); let selector = Selector::parse(r#"[id*="sct_"]:not([id*=".tit_"])"#).unwrap(); let element_ref = html.select(&selector).next().unwrap(); let section_left = SectionParser::parse(element_ref).unwrap(); + assert_eq!( + section_left.get_title(), + "Classification of AI systems as high-risk" + ); assert_eq!(section_left.len(), 2); assert_eq!(section_left.get(0).unwrap().get_number(), 6); assert_eq!(