From 8e3dd731c5fb3f43d0d3bcd8f0aecf98fddd1933 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Moritz=20H=C3=B6lting?= <87192362+moritz-hoelting@users.noreply.github.com> Date: Mon, 15 Dec 2025 22:51:20 +0100 Subject: [PATCH] scrape nutritional values --- ...251215213415_add_nutrition_values.down.sql | 15 +++ ...20251215213415_add_nutrition_values.up.sql | 34 ++++++ scraper/src/dish.rs | 114 ++++++++++++++---- scraper/src/main.rs | 2 +- scraper/src/menu.rs | 67 ++++++---- scraper/src/util.rs | 22 ++-- 6 files changed, 194 insertions(+), 60 deletions(-) create mode 100644 migrations/20251215213415_add_nutrition_values.down.sql create mode 100644 migrations/20251215213415_add_nutrition_values.up.sql diff --git a/migrations/20251215213415_add_nutrition_values.down.sql b/migrations/20251215213415_add_nutrition_values.down.sql new file mode 100644 index 0000000..a96635f --- /dev/null +++ b/migrations/20251215213415_add_nutrition_values.down.sql @@ -0,0 +1,15 @@ +-- Add down migration script here + +DROP VIEW IF EXISTS meals_view; + +ALTER TABLE meals +DROP COLUMN kjoules; + +ALTER TABLE meals +DROP COLUMN proteins; + +ALTER TABLE meals +DROP COLUMN carbohydrates; + +ALTER TABLE meals +DROP COLUMN fats; \ No newline at end of file diff --git a/migrations/20251215213415_add_nutrition_values.up.sql b/migrations/20251215213415_add_nutrition_values.up.sql new file mode 100644 index 0000000..88c5b44 --- /dev/null +++ b/migrations/20251215213415_add_nutrition_values.up.sql @@ -0,0 +1,34 @@ +-- Add up migration script here + +ALTER TABLE meals +ADD COLUMN kjoules INT; + +ALTER TABLE meals +ADD COLUMN proteins NUMERIC(6,2); + +ALTER TABLE meals +ADD COLUMN carbohydrates NUMERIC(6,2); + +ALTER TABLE meals +ADD COLUMN fats NUMERIC(6,2); + +CREATE OR REPLACE VIEW meals_view AS +SELECT + id, + date, + canteen, + name, + dish_type, + image_src, + price_students, + price_employees, + price_guests, + vegan, + vegetarian, + kjoules, + proteins, + carbohydrates, + fats, + round(kjoules / 4.184) 
AS kcal +FROM meals +WHERE is_latest = TRUE; \ No newline at end of file diff --git a/scraper/src/dish.rs b/scraper/src/dish.rs index 2bb8bcf..c665bb1 100644 --- a/scraper/src/dish.rs +++ b/scraper/src/dish.rs @@ -1,30 +1,52 @@ +use std::sync::LazyLock; + use itertools::Itertools; -use scraper::ElementRef; +use num_bigint::BigInt; +use scraper::{ElementRef, Selector}; use shared::DishType; +use sqlx::types::BigDecimal; + +static IMG_SELECTOR: LazyLock = + LazyLock::new(|| Selector::parse(".img img").expect("Failed to parse selector")); +static HTML_PRICE_SELECTOR: LazyLock = + LazyLock::new(|| Selector::parse(".desc .price").expect("Failed to parse selector")); +static HTML_EXTRAS_SELECTOR: LazyLock = + LazyLock::new(|| Selector::parse(".desc .buttons > *").expect("Failed to parse selector")); +static HTML_NUTRITIONS_SELECTOR: LazyLock = + LazyLock::new(|| Selector::parse(".nutritions > p").expect("Failed to parse selector")); #[derive(Debug, Clone, PartialEq, Eq)] pub struct Dish { name: String, image_src: Option, - price_students: Option, - price_employees: Option, - price_guests: Option, + price_students: BigDecimal, + price_employees: BigDecimal, + price_guests: BigDecimal, extras: Vec, dish_type: DishType, + pub nutrition_values: NutritionValues, +} + +#[derive(Debug, Clone, Default, PartialEq, Eq)] +pub struct NutritionValues { + pub kjoule: Option, + pub protein: Option, + pub carbs: Option, + pub fat: Option, } impl Dish { pub fn get_name(&self) -> &str { &self.name } - pub fn get_price_students(&self) -> Option<&str> { - self.price_students.as_deref() + pub fn get_price_students(&self) -> &BigDecimal { + &self.price_students } - pub fn get_price_employees(&self) -> Option<&str> { - self.price_employees.as_deref() + pub fn get_price_employees(&self) -> &BigDecimal { + &self.price_employees } - pub fn get_price_guests(&self) -> Option<&str> { - self.price_guests.as_deref() + pub fn get_price_guests(&self) -> &BigDecimal { + &self.price_guests } pub fn 
get_image_src(&self) -> Option<&str> { self.image_src.as_deref() @@ -51,8 +73,12 @@ impl Dish { == self.extras.iter().sorted().collect_vec() } - pub fn from_element(element: ElementRef, dish_type: DishType) -> Option { - let html_name_selector = scraper::Selector::parse(".desc h4").ok()?; + pub fn from_element( + element: ElementRef, + details: ElementRef, + dish_type: DishType, + ) -> Option { + let html_name_selector = Selector::parse(".desc h4").ok()?; let name = element .select(&html_name_selector) .next()? @@ -62,16 +88,14 @@ impl Dish { .trim() .to_string(); - let img_selector = scraper::Selector::parse(".img img").ok()?; - let img_src = element.select(&img_selector).next().and_then(|el| { + let img_src = element.select(&IMG_SELECTOR).next().and_then(|el| { el.value() .attr("src") .map(|img_src_path| format!("https://www.studierendenwerk-pb.de/{}", img_src_path)) }); - let html_price_selector = scraper::Selector::parse(".desc .price").ok()?; let mut prices = element - .select(&html_price_selector) + .select(&HTML_PRICE_SELECTOR) .filter_map(|price| { let price_for = price.first_child().and_then(|strong| { strong.first_child().and_then(|text_element| { @@ -92,29 +116,64 @@ impl Dish { }) .collect::>(); - let html_extras_selector = scraper::Selector::parse(".desc .buttons > *").ok()?; let extras = element - .select(&html_extras_selector) + .select(&HTML_EXTRAS_SELECTOR) .filter_map(|extra| extra.value().attr("title").map(|title| title.to_string())) .collect::>(); + let nutritions_element = details.select(&HTML_NUTRITIONS_SELECTOR).next(); + let nutrition_values = if let Some(nutritions_element) = nutritions_element { + let mut kjoule = None; + let mut protein = None; + let mut carbs = None; + let mut fat = None; + + for s in nutritions_element.text() { + let s = s.trim(); + if !s.is_empty() { + if let Some(rest) = s.strip_prefix("Brennwert = ") { + kjoule = rest + .split_whitespace() + .next() + .and_then(|num_str| num_str.parse().ok()); + } else if let 
Some(rest) = s.strip_prefix("Eiweiß = ") { + protein = grams_to_bigdecimal(rest); + } else if let Some(rest) = s.strip_prefix("Kohlenhydrate = ") { + carbs = grams_to_bigdecimal(rest); + } else if let Some(rest) = s.strip_prefix("Fett = ") { + fat = grams_to_bigdecimal(rest); + } + } + } + + NutritionValues { + kjoule, + protein, + carbs, + fat, + } + } else { + NutritionValues::default() + }; + Some(Self { name, image_src: img_src, price_students: prices .iter_mut() .find(|(price_for, _)| price_for == "Studierende") - .map(|(_, price)| std::mem::take(price)), + .map(|(_, price)| price_to_bigdecimal(Some(price)))?, price_employees: prices .iter_mut() .find(|(price_for, _)| price_for == "Bedienstete") - .map(|(_, price)| std::mem::take(price)), + .map(|(_, price)| price_to_bigdecimal(Some(price)))?, price_guests: prices .iter_mut() .find(|(price_for, _)| price_for == "Gäste") - .map(|(_, price)| std::mem::take(price)), + .map(|(_, price)| price_to_bigdecimal(Some(price)))?, extras, dish_type, + nutrition_values, }) } } @@ -124,3 +183,16 @@ impl PartialOrd for Dish { self.name.partial_cmp(&other.name) } } + +fn price_to_bigdecimal(s: Option<&str>) -> BigDecimal { + s.and_then(|p| p.trim_end_matches(" €").replace(',', ".").parse().ok()) + .unwrap_or_else(|| BigDecimal::from_bigint(BigInt::from(99999), 2)) +} + +fn grams_to_bigdecimal(s: &str) -> Option<BigDecimal> { + s.trim_end_matches("g") + .replace(',', ".") + .trim() + .parse() + .ok() +} diff --git a/scraper/src/main.rs b/scraper/src/main.rs index 9e0607a..4d5effc 100644 --- a/scraper/src/main.rs +++ b/scraper/src/main.rs @@ -47,7 +47,7 @@ async fn main() -> Result<()> { }) .unwrap_or_default(); let date_canteen_combinations = (0..7) .map(|d| (Utc::now() + Duration::days(d)).date_naive()) .cartesian_product(Canteen::iter()) .filter(|entry @ (_, canteen)| { diff --git a/scraper/src/menu.rs b/scraper/src/menu.rs index e33a48b..3f40085 100644 --- a/scraper/src/menu.rs +++
b/scraper/src/menu.rs @@ -1,9 +1,22 @@ +use std::sync::LazyLock; + use anyhow::Result; use chrono::NaiveDate; +use scraper::{Html, Selector}; use shared::{Canteen, DishType}; use crate::{canteen::CanteenExt as _, CustomError, Dish}; +static HTML_MAIN_DISHES_TBODY_SELECTOR: LazyLock = LazyLock::new(|| { + Selector::parse("table.table-dishes.main-dishes > tbody").expect("Failed to parse selector") +}); +static HTML_SIDE_DISHES_TBODY_SELECTOR: LazyLock = LazyLock::new(|| { + Selector::parse("table.table-dishes.side-dishes > tbody").expect("Failed to parse selector") +}); +static HTML_DESSERTS_TBODY_SELECTOR: LazyLock = LazyLock::new(|| { + Selector::parse("table.table-dishes.soups > tbody").expect("Failed to parse selector") +}); + #[tracing::instrument] pub async fn scrape_menu(date: &NaiveDate, canteen: Canteen) -> Result> { tracing::debug!("Starting scraping"); @@ -19,39 +32,41 @@ pub async fn scrape_menu(date: &NaiveDate, canteen: Canteen) -> Result let document = scraper::Html::parse_document(&html_content); - let html_main_dishes_selector = scraper::Selector::parse( - "table.table-dishes.main-dishes > tbody > tr.odd > td.description > div.row", - ) - .map_err(|_| CustomError::from("Failed to parse selector"))?; - let html_main_dishes = document.select(&html_main_dishes_selector); - let main_dishes = html_main_dishes - .filter_map(|dish| Dish::from_element(dish, DishType::Main)) - .collect::>(); - - let html_side_dishes_selector = scraper::Selector::parse( - "table.table-dishes.side-dishes > tbody > tr.odd > td.description > div.row", - ) - .map_err(|_| CustomError::from("Failed to parse selector"))?; - let html_side_dishes = document.select(&html_side_dishes_selector); - let side_dishes = html_side_dishes - .filter_map(|dish| Dish::from_element(dish, DishType::Side)) - .collect::>(); - - let html_desserts_selector = scraper::Selector::parse( - "table.table-dishes.soups > tbody > tr.odd > td.description > div.row", - ) - .map_err(|_| CustomError::from("Failed to 
parse selector"))?; - let html_desserts = document.select(&html_desserts_selector); - let desserts = html_desserts - .filter_map(|dish| Dish::from_element(dish, DishType::Dessert)) - .collect::<Vec<_>>(); + let main_dishes = scrape_category(&document, &HTML_MAIN_DISHES_TBODY_SELECTOR, DishType::Main)?; + let side_dishes = scrape_category(&document, &HTML_SIDE_DISHES_TBODY_SELECTOR, DishType::Side)?; + let desserts = scrape_category(&document, &HTML_DESSERTS_TBODY_SELECTOR, DishType::Dessert)?; let mut res = Vec::new(); res.extend(main_dishes); res.extend(side_dishes); res.extend(desserts); tracing::debug!("Finished scraping"); Ok(res) } + +static ITEM_SELECTOR: LazyLock<Selector> = LazyLock::new(|| { + Selector::parse("tr.odd > td.description > div.row").expect("Failed to parse selector") +}); +static ITEM_DETAILS_SELECTOR: LazyLock<Selector> = LazyLock::new(|| { + Selector::parse("tr.even > td.more > div.ingredients-list").expect("Failed to parse selector") +}); + +fn scrape_category<'a>( + document: &'a Html, + tbody_selector: &Selector, + dish_type: DishType, +) -> Result<impl Iterator<Item = Dish> + 'a> { + let tbody = document.select(tbody_selector).next().ok_or_else(|| { + CustomError::from(format!("No tbody found for selector: {:?}", tbody_selector)) + })?; + let dishes = tbody.select(&ITEM_SELECTOR); + let dish_details = tbody.select(&ITEM_DETAILS_SELECTOR); + + Ok(dishes + .zip(dish_details) + .filter_map(move |(dish, details)| Dish::from_element(dish, details, dish_type))) +} diff --git a/scraper/src/util.rs b/scraper/src/util.rs index 6854199..1c12f98 100644 --- a/scraper/src/util.rs +++ b/scraper/src/util.rs @@ -3,9 +3,8 @@ use std::env; use anyhow::Result; use chrono::NaiveDate; use futures::StreamExt as _; -use num_bigint::BigInt; use shared::{Canteen, DishType}; -use sqlx::{postgres::PgPoolOptions, types::BigDecimal, PgPool, PgTransaction}; +use sqlx::{postgres::PgPoolOptions, PgPool, PgTransaction}; use crate::{scrape_menu, Dish}; @@ -71,7 +70,7 @@ pub async fn add_menu_to_db( return
Ok(()); } - let mut query = sqlx::QueryBuilder::new("INSERT INTO meals (date,canteen,name,dish_type,image_src,price_students,price_employees,price_guests,vegan,vegetarian) "); + let mut query = sqlx::QueryBuilder::new("INSERT INTO meals (date,canteen,name,dish_type,image_src,price_students,price_employees,price_guests,vegan,vegetarian,kjoules,proteins,carbohydrates,fats) "); query .push_values(menu, |mut sep, item| { @@ -82,11 +81,15 @@ pub async fn add_menu_to_db( .push_bind(item.get_name().to_string()) .push_bind(item.get_type() as DishType) .push_bind(item.get_image_src().map(str::to_string)) - .push_bind(price_to_bigdecimal(item.get_price_students())) - .push_bind(price_to_bigdecimal(item.get_price_employees())) - .push_bind(price_to_bigdecimal(item.get_price_guests())) + .push_bind(item.get_price_students().to_owned()) + .push_bind(item.get_price_employees().to_owned()) + .push_bind(item.get_price_guests().to_owned()) .push_bind(vegan) - .push_bind(vegan || item.is_vegetarian()); + .push_bind(vegan || item.is_vegetarian()) + .push_bind(item.nutrition_values.kjoule) + .push_bind(item.nutrition_values.protein.to_owned()) + .push_bind(item.nutrition_values.carbs.to_owned()) + .push_bind(item.nutrition_values.fat.to_owned()); }) .build() .execute(&mut **db) @@ -104,8 +107,3 @@ pub async fn add_menu_to_db( Ok(()) } - -pub fn price_to_bigdecimal(s: Option<&str>) -> BigDecimal { - s.and_then(|p| p.trim_end_matches(" €").replace(',', ".").parse().ok()) - .unwrap_or_else(|| BigDecimal::from_bigint(BigInt::from(99999), 2)) -}