commit a85d745221f8b5a40c756c3ab8a60e8e49f3d2ca from: Romain VINCENT date: Sat Jan 10 16:37:42 2026 UTC Add title to Article and some related tests. commit - 5e29929b3959d317321b463d031ce893f92c8c61 commit + a85d745221f8b5a40c756c3ab8a60e8e49f3d2ca blob - d56f0bb587161ce5d3c502e8725f2036ffc8c7aa blob + 9033263b69faa760527389cc64fe58f760605606 --- eur-lex-scraper-naive/src/models/articles.rs +++ eur-lex-scraper-naive/src/models/articles.rs @@ -2,6 +2,7 @@ use crate::models::enacting_terms::Item; #[derive(Debug, Default, PartialEq, Eq)] pub struct Article { + pub title: String, pub number: u32, pub text: String, } blob - aecc33e03d3cc640dd0e922a36574765f9ef04d7 blob + ee8b814984b4deee0de30c78b56ce4a99f6e0ac2 --- eur-lex-scraper-naive/src/parsers/article.rs +++ eur-lex-scraper-naive/src/parsers/article.rs @@ -1,6 +1,6 @@ use crate::models::articles::Article; use nanohtml2text::html2text; -use scraper::ElementRef; +use scraper::{ElementRef, Selector}; use thiserror::Error; pub struct ArticleParser {} @@ -24,8 +24,17 @@ impl ArticleParser { Ok(number) => number, Err(_) => return Err(ArticleParserError::ErrorNumber), }; + let article_title_selector = Selector::parse(r#".oj-sti-art"#).unwrap(); + let title = match element.select(&article_title_selector).next() { + Some(title) => title.inner_html(), + None => "".to_string(), + }; let text = html2text(&element.inner_html()); - let article = Article { number, text }; + let article = Article { + title, + number, + text, + }; Ok(article) } } @@ -164,6 +173,7 @@ mod tests { let element_ref = html.select(&selector).next().unwrap(); let article_left = ArticleParser::parse(element_ref).unwrap(); let article_right = Article { + title: "Subject matter`".to_string(), number: 1, text: html2text(&element_ref.inner_html()), }; blob - c5371e6926b80bd6c694beed7b1a850dd88803ee blob + 1e54aa19a2acfbddd28779e7d31ff3c84cb06dc9 --- eur-lex-scraper-naive/src/parsers/section.rs +++ eur-lex-scraper-naive/src/parsers/section.rs @@ -433,7 +433,15 @@ mod tests { let section_left = SectionParser::parse(element_ref).unwrap(); assert_eq!(section_left.items.len(), 2); assert_eq!(section_left.items.get(0).unwrap().number, 6); + assert_eq!( + section_left.items.get(0).unwrap().title, + "Classification rules for high-risk AI systems".to_string() + ); assert_eq!(section_left.items.get(1).unwrap().number, 7); + assert_eq!( + section_left.items.get(1).unwrap().title, + "Amendments to Annex III".to_string() + ); /* let article_right = Article {