scrape nutritional values

This commit is contained in:
Moritz Hölting 2025-12-15 22:51:20 +01:00
parent 4729c1afab
commit 8e3dd731c5
6 changed files with 194 additions and 60 deletions

View File

@ -0,0 +1,15 @@
-- Down migration: remove the nutrition-value columns from `meals`.
--
-- The view is dropped first because its definition selects the columns that
-- are removed below.
-- NOTE(review): `meals_view` is not re-created by this script; if an earlier
-- migration defined it, that definition is not restored here — confirm that
-- this is intended.
DROP VIEW IF EXISTS meals_view;

-- One statement with several actions instead of four separate ALTER TABLEs.
ALTER TABLE meals
    DROP COLUMN kjoules,
    DROP COLUMN proteins,
    DROP COLUMN carbohydrates,
    DROP COLUMN fats;

View File

@ -0,0 +1,34 @@
-- Up migration: add nullable nutrition-value columns to `meals` and expose
-- them, together with a derived kcal value, through `meals_view`.
ALTER TABLE meals
    ADD COLUMN kjoules INT,
    ADD COLUMN proteins NUMERIC(6,2),
    ADD COLUMN carbohydrates NUMERIC(6,2),
    ADD COLUMN fats NUMERIC(6,2);

-- The view only shows rows whose `is_latest` flag is set.
CREATE OR REPLACE VIEW meals_view AS
SELECT
    id,
    date,
    canteen,
    name,
    dish_type,
    image_src,
    price_students,
    price_employees,
    price_guests,
    vegan,
    vegetarian,
    kjoules,
    proteins,
    carbohydrates,
    fats,
    -- 1 kcal = 4.184 kJ; NULL when kjoules is NULL.
    round(kjoules / 4.184) AS kcal
FROM meals
WHERE is_latest = TRUE;

View File

@ -1,30 +1,52 @@
use std::sync::LazyLock;
use itertools::Itertools;
use scraper::ElementRef;
use num_bigint::BigInt;
use scraper::{ElementRef, Selector};
use shared::DishType;
use sqlx::types::BigDecimal;
// CSS selectors used by `Dish::from_element`. Each one is parsed lazily on
// first use; the `expect` panics only if the hard-coded selector string is
// not valid selector syntax.

/// Matches the dish image (`<img>` below `.img`); its `src` attribute is read.
static IMG_SELECTOR: LazyLock<Selector> =
LazyLock::new(|| Selector::parse(".img img").expect("Failed to parse selector"));
/// Matches every price element (`.price` below `.desc`) of a dish.
static HTML_PRICE_SELECTOR: LazyLock<Selector> =
LazyLock::new(|| Selector::parse(".desc .price").expect("Failed to parse selector"));
/// Matches the direct children of `.desc .buttons`; their `title` attributes
/// are collected as the dish's extras.
static HTML_EXTRAS_SELECTOR: LazyLock<Selector> =
LazyLock::new(|| Selector::parse(".desc .buttons > *").expect("Failed to parse selector"));
/// Matches `<p>` children of `.nutritions` inside a dish's details element;
/// only the first match is used.
static HTML_NUTRITIONS_SELECTOR: LazyLock<Selector> =
LazyLock::new(|| Selector::parse(".nutritions > p").expect("Failed to parse selector"));
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Dish {
name: String,
image_src: Option<String>,
price_students: Option<String>,
price_employees: Option<String>,
price_guests: Option<String>,
price_students: BigDecimal,
price_employees: BigDecimal,
price_guests: BigDecimal,
extras: Vec<String>,
dish_type: DishType,
pub nutrition_values: NutritionValues,
}
/// Nutrition values scraped for a single dish.
///
/// Every field is optional: it stays `None` when the corresponding line is
/// missing from the page or its number cannot be parsed. `Default` yields a
/// value with all fields `None`.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct NutritionValues {
/// Energy value, taken from the first token after `Brennwert = `
/// (presumably kilojoules, going by the field name — confirm against the page).
pub kjoule: Option<i64>,
/// Protein amount, parsed from the `Eiweiß = ` line (presumably grams).
pub protein: Option<BigDecimal>,
/// Carbohydrate amount, parsed from the `Kohlenhydrate = ` line (presumably grams).
pub carbs: Option<BigDecimal>,
/// Fat amount, parsed from the `Fett = ` line (presumably grams).
pub fat: Option<BigDecimal>,
}
impl Dish {
pub fn get_name(&self) -> &str {
&self.name
}
pub fn get_price_students(&self) -> Option<&str> {
self.price_students.as_deref()
pub fn get_price_students(&self) -> &BigDecimal {
&self.price_students
}
pub fn get_price_employees(&self) -> Option<&str> {
self.price_employees.as_deref()
pub fn get_price_employees(&self) -> &BigDecimal {
&self.price_employees
}
pub fn get_price_guests(&self) -> Option<&str> {
self.price_guests.as_deref()
pub fn get_price_guests(&self) -> &BigDecimal {
&self.price_guests
}
pub fn get_image_src(&self) -> Option<&str> {
self.image_src.as_deref()
@ -51,8 +73,12 @@ impl Dish {
== self.extras.iter().sorted().collect_vec()
}
pub fn from_element(element: ElementRef, dish_type: DishType) -> Option<Self> {
let html_name_selector = scraper::Selector::parse(".desc h4").ok()?;
pub fn from_element(
element: ElementRef,
details: ElementRef,
dish_type: DishType,
) -> Option<Self> {
let html_name_selector = Selector::parse(".desc h4").ok()?;
let name = element
.select(&html_name_selector)
.next()?
@ -62,16 +88,14 @@ impl Dish {
.trim()
.to_string();
let img_selector = scraper::Selector::parse(".img img").ok()?;
let img_src = element.select(&img_selector).next().and_then(|el| {
let img_src = element.select(&IMG_SELECTOR).next().and_then(|el| {
el.value()
.attr("src")
.map(|img_src_path| format!("https://www.studierendenwerk-pb.de/{}", img_src_path))
});
let html_price_selector = scraper::Selector::parse(".desc .price").ok()?;
let mut prices = element
.select(&html_price_selector)
.select(&HTML_PRICE_SELECTOR)
.filter_map(|price| {
let price_for = price.first_child().and_then(|strong| {
strong.first_child().and_then(|text_element| {
@ -92,29 +116,64 @@ impl Dish {
})
.collect::<Vec<_>>();
let html_extras_selector = scraper::Selector::parse(".desc .buttons > *").ok()?;
let extras = element
.select(&html_extras_selector)
.select(&HTML_EXTRAS_SELECTOR)
.filter_map(|extra| extra.value().attr("title").map(|title| title.to_string()))
.collect::<Vec<_>>();
let nutritions_element = details.select(&HTML_NUTRITIONS_SELECTOR).next();
let nutrition_values = if let Some(nutritions_element) = nutritions_element {
let mut kjoule = None;
let mut protein = None;
let mut carbs = None;
let mut fat = None;
for s in nutritions_element.text() {
let s = s.trim();
if !s.is_empty() {
if let Some(rest) = s.strip_prefix("Brennwert = ") {
kjoule = rest
.split_whitespace()
.next()
.and_then(|num_str| num_str.parse().ok());
} else if let Some(rest) = s.strip_prefix("Eiweiß = ") {
protein = grams_to_bigdecimal(rest);
} else if let Some(rest) = s.strip_prefix("Kohlenhydrate = ") {
carbs = grams_to_bigdecimal(rest);
} else if let Some(rest) = s.strip_prefix("Fett = ") {
fat = grams_to_bigdecimal(rest);
}
}
}
NutritionValues {
kjoule,
protein,
carbs,
fat,
}
} else {
NutritionValues::default()
};
Some(Self {
name,
image_src: img_src,
price_students: prices
.iter_mut()
.find(|(price_for, _)| price_for == "Studierende")
.map(|(_, price)| std::mem::take(price)),
.map(|(_, price)| price_to_bigdecimal(Some(price)))?,
price_employees: prices
.iter_mut()
.find(|(price_for, _)| price_for == "Bedienstete")
.map(|(_, price)| std::mem::take(price)),
.map(|(_, price)| price_to_bigdecimal(Some(price)))?,
price_guests: prices
.iter_mut()
.find(|(price_for, _)| price_for == "Gäste")
.map(|(_, price)| std::mem::take(price)),
.map(|(_, price)| price_to_bigdecimal(Some(price)))?,
extras,
dish_type,
nutrition_values,
})
}
}
@ -124,3 +183,16 @@ impl PartialOrd for Dish {
self.name.partial_cmp(&other.name)
}
}
/// Converts a scraped price string into a [`BigDecimal`].
///
/// Surrounding whitespace and a trailing `€` sign are removed and a decimal
/// comma is turned into a dot before parsing, so both `"3,50"` and `"3,50 €"`
/// are accepted.
///
/// Returns the sentinel value `999.99` when `s` is `None` or the cleaned-up
/// text is not a valid decimal number.
fn price_to_bigdecimal(s: Option<&str>) -> BigDecimal {
    s.and_then(|p| {
        // The previous `trim_end_matches("")` used an empty pattern and
        // therefore stripped nothing; strip the currency suffix explicitly.
        p.trim()
            .trim_end_matches('€')
            .trim_end()
            .replace(',', ".")
            .parse()
            .ok()
    })
    // 99999 with scale 2 == 999.99, the fallback for unparsable prices.
    .unwrap_or_else(|| BigDecimal::from_bigint(BigInt::from(99999), 2))
}
/// Parses a gram amount (for example `"12,5 g"`) into a [`BigDecimal`].
///
/// Trailing `g` characters and surrounding whitespace are removed and decimal
/// commas are replaced by dots before parsing. Returns `None` when what is
/// left is not a valid decimal number.
fn grams_to_bigdecimal(s: &str) -> Option<BigDecimal> {
    let amount = s.trim_end_matches('g').trim();
    let normalized = amount.replace(',', ".");
    normalized.parse().ok()
}

View File

@ -47,7 +47,7 @@ async fn main() -> Result<()> {
})
.unwrap_or_default();
let date_canteen_combinations = (0..7)
let date_canteen_combinations = (0..1)
.map(|d| (Utc::now() + Duration::days(d)).date_naive())
.cartesian_product(Canteen::iter())
.filter(|entry @ (_, canteen)| {

View File

@ -1,9 +1,22 @@
use std::sync::LazyLock;
use anyhow::Result;
use chrono::NaiveDate;
use scraper::{Html, Selector};
use shared::{Canteen, DishType};
use crate::{canteen::CanteenExt as _, CustomError, Dish};
// Selectors for the `<tbody>` of each dish-category table. They are parsed
// lazily on first use; the `expect` panics only if the hard-coded selector
// string is not valid selector syntax.

/// `<tbody>` of the main-dishes table.
static HTML_MAIN_DISHES_TBODY_SELECTOR: LazyLock<Selector> = LazyLock::new(|| {
Selector::parse("table.table-dishes.main-dishes > tbody").expect("Failed to parse selector")
});
/// `<tbody>` of the side-dishes table.
static HTML_SIDE_DISHES_TBODY_SELECTOR: LazyLock<Selector> = LazyLock::new(|| {
Selector::parse("table.table-dishes.side-dishes > tbody").expect("Failed to parse selector")
});
/// `<tbody>` of the table whose dishes are scraped as `DishType::Dessert`.
/// NOTE(review): the selector targets the table with class `soups` — confirm
/// that this is the table holding the desserts.
static HTML_DESSERTS_TBODY_SELECTOR: LazyLock<Selector> = LazyLock::new(|| {
Selector::parse("table.table-dishes.soups > tbody").expect("Failed to parse selector")
});
#[tracing::instrument]
pub async fn scrape_menu(date: &NaiveDate, canteen: Canteen) -> Result<Vec<Dish>> {
tracing::debug!("Starting scraping");
@ -19,39 +32,41 @@ pub async fn scrape_menu(date: &NaiveDate, canteen: Canteen) -> Result<Vec<Dish>
let document = scraper::Html::parse_document(&html_content);
let html_main_dishes_selector = scraper::Selector::parse(
"table.table-dishes.main-dishes > tbody > tr.odd > td.description > div.row",
)
.map_err(|_| CustomError::from("Failed to parse selector"))?;
let html_main_dishes = document.select(&html_main_dishes_selector);
let main_dishes = html_main_dishes
.filter_map(|dish| Dish::from_element(dish, DishType::Main))
.collect::<Vec<_>>();
let html_side_dishes_selector = scraper::Selector::parse(
"table.table-dishes.side-dishes > tbody > tr.odd > td.description > div.row",
)
.map_err(|_| CustomError::from("Failed to parse selector"))?;
let html_side_dishes = document.select(&html_side_dishes_selector);
let side_dishes = html_side_dishes
.filter_map(|dish| Dish::from_element(dish, DishType::Side))
.collect::<Vec<_>>();
let html_desserts_selector = scraper::Selector::parse(
"table.table-dishes.soups > tbody > tr.odd > td.description > div.row",
)
.map_err(|_| CustomError::from("Failed to parse selector"))?;
let html_desserts = document.select(&html_desserts_selector);
let desserts = html_desserts
.filter_map(|dish| Dish::from_element(dish, DishType::Dessert))
.collect::<Vec<_>>();
let main_dishes = scrape_category(&document, &HTML_MAIN_DISHES_TBODY_SELECTOR, DishType::Main)?;
let side_dishes = scrape_category(&document, &HTML_SIDE_DISHES_TBODY_SELECTOR, DishType::Side)?;
let desserts = scrape_category(&document, &HTML_DESSERTS_TBODY_SELECTOR, DishType::Dessert)?;
let mut res = Vec::new();
res.extend(main_dishes);
res.extend(side_dishes);
res.extend(desserts);
dbg!(&res);
tracing::debug!("Finished scraping");
Ok(res)
}
/// Matches the description block of each dish inside a category `<tbody>`
/// (`div.row` in the `td.description` cell of a `tr.odd` row).
static ITEM_SELECTOR: LazyLock<Selector> = LazyLock::new(|| {
Selector::parse("tr.odd > td.description > div.row").expect("Failed to parse selector")
});
/// Matches the details block of each dish inside a category `<tbody>`
/// (`div.ingredients-list` in the `td.more` cell of a `tr.even` row).
static ITEM_DETAILS_SELECTOR: LazyLock<Selector> = LazyLock::new(|| {
Selector::parse("tr.even > td.more > div.ingredients-list").expect("Failed to parse selector")
});
/// Scrapes all dishes of one category table.
///
/// The first element matching `tbody_selector` is taken as the category's
/// table body. Inside it, the elements matching `ITEM_SELECTOR` are paired
/// with the elements matching `ITEM_DETAILS_SELECTOR` and each pair is handed
/// to [`Dish::from_element`]; pairs for which that returns `None` are skipped.
///
/// # Errors
///
/// Returns an error when `tbody_selector` matches nothing in `document`.
///
/// NOTE(review): the pairing is purely positional. If a dish has no matching
/// details element, the following dishes are paired with the wrong details and
/// surplus dishes at the end are dropped — confirm the page always contains
/// one details element per dish.
fn scrape_category<'a>(
    document: &'a Html,
    tbody_selector: &Selector,
    dish_type: DishType,
) -> Result<impl Iterator<Item = Dish> + 'a> {
    let Some(table_body) = document.select(tbody_selector).next() else {
        return Err(CustomError::from(format!(
            "No tbody found for selector: {:?}",
            tbody_selector
        ))
        .into());
    };

    let rows = table_body.select(&ITEM_SELECTOR);
    let detail_rows = table_body.select(&ITEM_DETAILS_SELECTOR);
    let parsed = rows
        .zip(detail_rows)
        .filter_map(move |(row, detail_row)| Dish::from_element(row, detail_row, dish_type));

    Ok(parsed)
}

View File

@ -3,9 +3,8 @@ use std::env;
use anyhow::Result;
use chrono::NaiveDate;
use futures::StreamExt as _;
use num_bigint::BigInt;
use shared::{Canteen, DishType};
use sqlx::{postgres::PgPoolOptions, types::BigDecimal, PgPool, PgTransaction};
use sqlx::{postgres::PgPoolOptions, PgPool, PgTransaction};
use crate::{scrape_menu, Dish};
@ -71,7 +70,7 @@ pub async fn add_menu_to_db(
return Ok(());
}
let mut query = sqlx::QueryBuilder::new("INSERT INTO meals (date,canteen,name,dish_type,image_src,price_students,price_employees,price_guests,vegan,vegetarian) ");
let mut query = sqlx::QueryBuilder::new("INSERT INTO meals (date,canteen,name,dish_type,image_src,price_students,price_employees,price_guests,vegan,vegetarian,kjoules,proteins,carbohydrates,fats) ");
query
.push_values(menu, |mut sep, item| {
@ -82,11 +81,15 @@ pub async fn add_menu_to_db(
.push_bind(item.get_name().to_string())
.push_bind(item.get_type() as DishType)
.push_bind(item.get_image_src().map(str::to_string))
.push_bind(price_to_bigdecimal(item.get_price_students()))
.push_bind(price_to_bigdecimal(item.get_price_employees()))
.push_bind(price_to_bigdecimal(item.get_price_guests()))
.push_bind(item.get_price_students().to_owned())
.push_bind(item.get_price_employees().to_owned())
.push_bind(item.get_price_guests().to_owned())
.push_bind(vegan)
.push_bind(vegan || item.is_vegetarian());
.push_bind(vegan || item.is_vegetarian())
.push_bind(item.nutrition_values.kjoule)
.push_bind(item.nutrition_values.protein.to_owned())
.push_bind(item.nutrition_values.carbs.to_owned())
.push_bind(item.nutrition_values.fat.to_owned());
})
.build()
.execute(&mut **db)
@ -104,8 +107,3 @@ pub async fn add_menu_to_db(
Ok(())
}
/// Converts a scraped price string into a [`BigDecimal`].
///
/// Surrounding whitespace and a trailing `€` sign are removed and a decimal
/// comma is turned into a dot before parsing, so both `"3,50"` and `"3,50 €"`
/// are accepted.
///
/// Returns the sentinel value `999.99` when `s` is `None` or the cleaned-up
/// text is not a valid decimal number.
pub fn price_to_bigdecimal(s: Option<&str>) -> BigDecimal {
    s.and_then(|p| {
        // The previous `trim_end_matches("")` used an empty pattern and
        // therefore stripped nothing; strip the currency suffix explicitly.
        p.trim()
            .trim_end_matches('€')
            .trim_end()
            .replace(',', ".")
            .parse()
            .ok()
    })
    // 99999 with scale 2 == 999.99, the fallback for unparsable prices.
    .unwrap_or_else(|| BigDecimal::from_bigint(BigInt::from(99999), 2))
}