commit - 5e29929b3959d317321b463d031ce893f92c8c61
commit + a85d745221f8b5a40c756c3ab8a60e8e49f3d2ca
blob - d56f0bb587161ce5d3c502e8725f2036ffc8c7aa
blob + 9033263b69faa760527389cc64fe58f760605606
--- eur-lex-scraper-naive/src/models/articles.rs
+++ eur-lex-scraper-naive/src/models/articles.rs
/// A single article produced by the article parser.
///
/// Derives `Default`, so an empty value has an empty `title`, `number == 0`
/// and an empty `text`. `PartialEq`/`Eq` allow whole-value comparison, which
/// the parser tests rely on (`Article { .. }` built by hand vs. parsed).
#[derive(Debug, Default, PartialEq, Eq)]
pub struct Article {
/// Article heading. The parser fills this with the inner HTML of the first
/// `.oj-sti-art` descendant, or an empty string when no such element exists.
+ pub title: String,
/// Article number as an unsigned integer (the parser reports
/// `ArticleParserError::ErrorNumber` when it cannot be parsed).
pub number: u32,
/// Plain-text body, obtained by running `html2text` over the article
/// element's inner HTML.
pub text: String,
}
blob - aecc33e03d3cc640dd0e922a36574765f9ef04d7
blob + ee8b814984b4deee0de30c78b56ce4a99f6e0ac2
--- eur-lex-scraper-naive/src/parsers/article.rs
+++ eur-lex-scraper-naive/src/parsers/article.rs
use crate::models::articles::Article;
use nanohtml2text::html2text;
-use scraper::ElementRef;
+use scraper::{ElementRef, Selector};
use thiserror::Error;
/// Field-less parser type used as a namespace for article parsing.
///
/// Called as `ArticleParser::parse(element_ref)`; on success it yields an
/// `Article`, and a number that fails to parse is reported as
/// `ArticleParserError::ErrorNumber`.
pub struct ArticleParser {}
Ok(number) => number,
Err(_) => return Err(ArticleParserError::ErrorNumber),
};
+ let article_title_selector = Selector::parse(r#".oj-sti-art"#).unwrap();
+ let title = match element.select(&article_title_selector).next() {
+ Some(title) => title.inner_html(),
+ None => "".to_string(),
+ };
let text = html2text(&element.inner_html());
- let article = Article { number, text };
+ let article = Article {
+ title,
+ number,
+ text,
+ };
Ok(article)
}
}
let element_ref = html.select(&selector).next().unwrap();
let article_left = ArticleParser::parse(element_ref).unwrap();
let article_right = Article {
+ title: "Subject matter`".to_string(),
number: 1,
text: html2text(&element_ref.inner_html()),
};
blob - c5371e6926b80bd6c694beed7b1a850dd88803ee
blob + 1e54aa19a2acfbddd28779e7d31ff3c84cb06dc9
--- eur-lex-scraper-naive/src/parsers/section.rs
+++ eur-lex-scraper-naive/src/parsers/section.rs
let section_left = SectionParser::parse(element_ref).unwrap();
assert_eq!(section_left.items.len(), 2);
assert_eq!(section_left.items.get(0).unwrap().number, 6);
+ assert_eq!(
+ section_left.items.get(0).unwrap().title,
+ "Classification rules for high-risk AI systems".to_string()
+ );
assert_eq!(section_left.items.get(1).unwrap().number, 7);
+ assert_eq!(
+ section_left.items.get(1).unwrap().title,
+ "Amendments to Annex III".to_string()
+ );
/*
let article_right = Article {