scrape nutritional values

commit 8e3dd731c5
parent 4729c1afab
@@ -0,0 +1,15 @@
+-- Add down migration script here
+
+DROP VIEW IF EXISTS meals_view;
+
+ALTER TABLE meals
+DROP COLUMN kjoules;
+
+ALTER TABLE meals
+DROP COLUMN proteins;
+
+ALTER TABLE meals
+DROP COLUMN carbohydrates;
+
+ALTER TABLE meals
+DROP COLUMN fats;
@@ -0,0 +1,34 @@
+-- Add up migration script here
+
+ALTER TABLE meals
+ADD COLUMN kjoules INT;
+
+ALTER TABLE meals
+ADD COLUMN proteins NUMERIC(6,2);
+
+ALTER TABLE meals
+ADD COLUMN carbohydrates NUMERIC(6,2);
+
+ALTER TABLE meals
+ADD COLUMN fats NUMERIC(6,2);
+
+CREATE OR REPLACE VIEW meals_view AS
+SELECT
+    id,
+    date,
+    canteen,
+    name,
+    dish_type,
+    image_src,
+    price_students,
+    price_employees,
+    price_guests,
+    vegan,
+    vegetarian,
+    kjoules,
+    proteins,
+    carbohydrates,
+    fats,
+    round(kjoules / 4.184) AS kcal
+FROM meals
+WHERE is_latest = TRUE;
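The view derives kilocalories from the stored kilojoules using 1 kcal = 4.184 kJ. A minimal Rust sketch of the same conversion, for reference only (the helper name is illustrative and not part of this commit):

// Mirrors the SQL expression `round(kjoules / 4.184) AS kcal` in meals_view.
fn kjoules_to_kcal(kjoules: i64) -> i64 {
    // 1 kcal = 4.184 kJ, rounded to the nearest whole kilocalorie.
    (kjoules as f64 / 4.184).round() as i64
}

fn main() {
    assert_eq!(kjoules_to_kcal(2092), 500);
}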
@@ -1,30 +1,52 @@
+use std::sync::LazyLock;
+
 use itertools::Itertools;
-use scraper::ElementRef;
+use num_bigint::BigInt;
+use scraper::{ElementRef, Selector};
 use shared::DishType;
+use sqlx::types::BigDecimal;
+
+static IMG_SELECTOR: LazyLock<Selector> =
+    LazyLock::new(|| Selector::parse(".img img").expect("Failed to parse selector"));
+static HTML_PRICE_SELECTOR: LazyLock<Selector> =
+    LazyLock::new(|| Selector::parse(".desc .price").expect("Failed to parse selector"));
+static HTML_EXTRAS_SELECTOR: LazyLock<Selector> =
+    LazyLock::new(|| Selector::parse(".desc .buttons > *").expect("Failed to parse selector"));
+static HTML_NUTRITIONS_SELECTOR: LazyLock<Selector> =
+    LazyLock::new(|| Selector::parse(".nutritions > p").expect("Failed to parse selector"));
 
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct Dish {
     name: String,
     image_src: Option<String>,
-    price_students: Option<String>,
-    price_employees: Option<String>,
-    price_guests: Option<String>,
+    price_students: BigDecimal,
+    price_employees: BigDecimal,
+    price_guests: BigDecimal,
     extras: Vec<String>,
     dish_type: DishType,
+    pub nutrition_values: NutritionValues,
+}
+
+#[derive(Debug, Clone, Default, PartialEq, Eq)]
+pub struct NutritionValues {
+    pub kjoule: Option<i64>,
+    pub protein: Option<BigDecimal>,
+    pub carbs: Option<BigDecimal>,
+    pub fat: Option<BigDecimal>,
 }
 
 impl Dish {
     pub fn get_name(&self) -> &str {
         &self.name
     }
-    pub fn get_price_students(&self) -> Option<&str> {
-        self.price_students.as_deref()
+    pub fn get_price_students(&self) -> &BigDecimal {
+        &self.price_students
     }
-    pub fn get_price_employees(&self) -> Option<&str> {
-        self.price_employees.as_deref()
+    pub fn get_price_employees(&self) -> &BigDecimal {
+        &self.price_employees
     }
-    pub fn get_price_guests(&self) -> Option<&str> {
-        self.price_guests.as_deref()
+    pub fn get_price_guests(&self) -> &BigDecimal {
+        &self.price_guests
     }
     pub fn get_image_src(&self) -> Option<&str> {
         self.image_src.as_deref()
@@ -51,8 +73,12 @@ impl Dish {
             == self.extras.iter().sorted().collect_vec()
     }
 
-    pub fn from_element(element: ElementRef, dish_type: DishType) -> Option<Self> {
-        let html_name_selector = scraper::Selector::parse(".desc h4").ok()?;
+    pub fn from_element(
+        element: ElementRef,
+        details: ElementRef,
+        dish_type: DishType,
+    ) -> Option<Self> {
+        let html_name_selector = Selector::parse(".desc h4").ok()?;
         let name = element
             .select(&html_name_selector)
             .next()?
@@ -62,16 +88,14 @@ impl Dish {
             .trim()
             .to_string();
 
-        let img_selector = scraper::Selector::parse(".img img").ok()?;
-        let img_src = element.select(&img_selector).next().and_then(|el| {
+        let img_src = element.select(&IMG_SELECTOR).next().and_then(|el| {
             el.value()
                 .attr("src")
                 .map(|img_src_path| format!("https://www.studierendenwerk-pb.de/{}", img_src_path))
         });
 
-        let html_price_selector = scraper::Selector::parse(".desc .price").ok()?;
         let mut prices = element
-            .select(&html_price_selector)
+            .select(&HTML_PRICE_SELECTOR)
             .filter_map(|price| {
                 let price_for = price.first_child().and_then(|strong| {
                     strong.first_child().and_then(|text_element| {
@@ -92,29 +116,64 @@ impl Dish {
                 })
             .collect::<Vec<_>>();
 
-        let html_extras_selector = scraper::Selector::parse(".desc .buttons > *").ok()?;
         let extras = element
-            .select(&html_extras_selector)
+            .select(&HTML_EXTRAS_SELECTOR)
             .filter_map(|extra| extra.value().attr("title").map(|title| title.to_string()))
             .collect::<Vec<_>>();
 
+        let nutritions_element = details.select(&HTML_NUTRITIONS_SELECTOR).next();
+        let nutrition_values = if let Some(nutritions_element) = nutritions_element {
+            let mut kjoule = None;
+            let mut protein = None;
+            let mut carbs = None;
+            let mut fat = None;
+
+            for s in nutritions_element.text() {
+                let s = s.trim();
+                if !s.is_empty() {
+                    if let Some(rest) = s.strip_prefix("Brennwert = ") {
+                        kjoule = rest
+                            .split_whitespace()
+                            .next()
+                            .and_then(|num_str| num_str.parse().ok());
+                    } else if let Some(rest) = s.strip_prefix("Eiweiß = ") {
+                        protein = grams_to_bigdecimal(rest);
+                    } else if let Some(rest) = s.strip_prefix("Kohlenhydrate = ") {
+                        carbs = grams_to_bigdecimal(rest);
+                    } else if let Some(rest) = s.strip_prefix("Fett = ") {
+                        fat = grams_to_bigdecimal(rest);
+                    }
+                }
+            }
+
+            NutritionValues {
+                kjoule,
+                protein,
+                carbs,
+                fat,
+            }
+        } else {
+            NutritionValues::default()
+        };
+
         Some(Self {
             name,
             image_src: img_src,
             price_students: prices
                 .iter_mut()
                 .find(|(price_for, _)| price_for == "Studierende")
-                .map(|(_, price)| std::mem::take(price)),
+                .map(|(_, price)| price_to_bigdecimal(Some(price)))?,
             price_employees: prices
                 .iter_mut()
                 .find(|(price_for, _)| price_for == "Bedienstete")
-                .map(|(_, price)| std::mem::take(price)),
+                .map(|(_, price)| price_to_bigdecimal(Some(price)))?,
             price_guests: prices
                 .iter_mut()
                 .find(|(price_for, _)| price_for == "Gäste")
-                .map(|(_, price)| std::mem::take(price)),
+                .map(|(_, price)| price_to_bigdecimal(Some(price)))?,
             extras,
             dish_type,
+            nutrition_values,
         })
     }
 }
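The nutrition parsing above keys off German labels in the text of the `.nutritions > p` element. A minimal sketch of the prefix-based dispatch, assuming input strings shaped like "Brennwert = 2510 kJ" (the sample values are invented; only the prefixes come from the code):

// Illustrative only: reproduces the kilojoule branch of the loop in from_element.
fn main() {
    let texts = ["Brennwert = 2510 kJ", "Eiweiß = 12,3g", "Fett = 9,8g"];
    let mut kjoule: Option<i64> = None;
    for s in texts.iter().map(|s| s.trim()).filter(|s| !s.is_empty()) {
        if let Some(rest) = s.strip_prefix("Brennwert = ") {
            // Take the number that precedes the "kJ" unit.
            kjoule = rest.split_whitespace().next().and_then(|n| n.parse().ok());
        }
    }
    assert_eq!(kjoule, Some(2510));
}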
@@ -124,3 +183,16 @@ impl PartialOrd for Dish {
         self.name.partial_cmp(&other.name)
     }
 }
+
+fn price_to_bigdecimal(s: Option<&str>) -> BigDecimal {
+    s.and_then(|p| p.trim_end_matches(" €").replace(',', ".").parse().ok())
+        .unwrap_or_else(|| BigDecimal::from_bigint(BigInt::from(99999), 2))
+}
+
+fn grams_to_bigdecimal(s: &str) -> Option<BigDecimal> {
+    s.trim_end_matches("g")
+        .replace(',', ".")
+        .trim()
+        .parse()
+        .ok()
+}
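For orientation, the two helpers normalize German-formatted numbers: price_to_bigdecimal strips a trailing " €" and swaps the decimal comma (falling back to the 999.99 sentinel built via BigDecimal::from_bigint above), while grams_to_bigdecimal strips a trailing "g". A small usage sketch, assuming the bigdecimal crate behind sqlx's BigDecimal and invented sample inputs:

// Illustrative only: the same string normalization the helpers perform.
use bigdecimal::BigDecimal;
use std::str::FromStr;

fn main() {
    // "1,50 €" -> "1.50" -> BigDecimal
    let price: BigDecimal = "1,50 €".trim_end_matches(" €").replace(',', ".").parse().unwrap();
    assert_eq!(price, BigDecimal::from_str("1.50").unwrap());

    // "12,3g" -> "12.3" -> BigDecimal
    let grams: BigDecimal = "12,3g".trim_end_matches('g').replace(',', ".").parse().unwrap();
    assert_eq!(grams, BigDecimal::from_str("12.3").unwrap());
}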
@@ -47,7 +47,7 @@ async fn main() -> Result<()> {
         })
         .unwrap_or_default();
 
-    let date_canteen_combinations = (0..7)
+    let date_canteen_combinations = (0..1)
         .map(|d| (Utc::now() + Duration::days(d)).date_naive())
         .cartesian_product(Canteen::iter())
         .filter(|entry @ (_, canteen)| {
@@ -1,9 +1,22 @@
+use std::sync::LazyLock;
+
 use anyhow::Result;
 use chrono::NaiveDate;
+use scraper::{Html, Selector};
 use shared::{Canteen, DishType};
 
 use crate::{canteen::CanteenExt as _, CustomError, Dish};
 
+static HTML_MAIN_DISHES_TBODY_SELECTOR: LazyLock<Selector> = LazyLock::new(|| {
+    Selector::parse("table.table-dishes.main-dishes > tbody").expect("Failed to parse selector")
+});
+static HTML_SIDE_DISHES_TBODY_SELECTOR: LazyLock<Selector> = LazyLock::new(|| {
+    Selector::parse("table.table-dishes.side-dishes > tbody").expect("Failed to parse selector")
+});
+static HTML_DESSERTS_TBODY_SELECTOR: LazyLock<Selector> = LazyLock::new(|| {
+    Selector::parse("table.table-dishes.soups > tbody").expect("Failed to parse selector")
+});
+
 #[tracing::instrument]
 pub async fn scrape_menu(date: &NaiveDate, canteen: Canteen) -> Result<Vec<Dish>> {
     tracing::debug!("Starting scraping");
@@ -19,39 +32,41 @@ pub async fn scrape_menu(date: &NaiveDate, canteen: Canteen) -> Result<Vec<Dish>
 
     let document = scraper::Html::parse_document(&html_content);
 
-    let html_main_dishes_selector = scraper::Selector::parse(
-        "table.table-dishes.main-dishes > tbody > tr.odd > td.description > div.row",
-    )
-    .map_err(|_| CustomError::from("Failed to parse selector"))?;
-    let html_main_dishes = document.select(&html_main_dishes_selector);
-    let main_dishes = html_main_dishes
-        .filter_map(|dish| Dish::from_element(dish, DishType::Main))
-        .collect::<Vec<_>>();
-
-    let html_side_dishes_selector = scraper::Selector::parse(
-        "table.table-dishes.side-dishes > tbody > tr.odd > td.description > div.row",
-    )
-    .map_err(|_| CustomError::from("Failed to parse selector"))?;
-    let html_side_dishes = document.select(&html_side_dishes_selector);
-    let side_dishes = html_side_dishes
-        .filter_map(|dish| Dish::from_element(dish, DishType::Side))
-        .collect::<Vec<_>>();
-
-    let html_desserts_selector = scraper::Selector::parse(
-        "table.table-dishes.soups > tbody > tr.odd > td.description > div.row",
-    )
-    .map_err(|_| CustomError::from("Failed to parse selector"))?;
-    let html_desserts = document.select(&html_desserts_selector);
-    let desserts = html_desserts
-        .filter_map(|dish| Dish::from_element(dish, DishType::Dessert))
-        .collect::<Vec<_>>();
+    let main_dishes = scrape_category(&document, &HTML_MAIN_DISHES_TBODY_SELECTOR, DishType::Main)?;
+    let side_dishes = scrape_category(&document, &HTML_SIDE_DISHES_TBODY_SELECTOR, DishType::Side)?;
+    let desserts = scrape_category(&document, &HTML_DESSERTS_TBODY_SELECTOR, DishType::Dessert)?;
 
     let mut res = Vec::new();
     res.extend(main_dishes);
     res.extend(side_dishes);
     res.extend(desserts);
 
+    dbg!(&res);
+
     tracing::debug!("Finished scraping");
 
     Ok(res)
 }
+
+static ITEM_SELECTOR: LazyLock<Selector> = LazyLock::new(|| {
+    Selector::parse("tr.odd > td.description > div.row").expect("Failed to parse selector")
+});
+static ITEM_DETAILS_SELECTOR: LazyLock<Selector> = LazyLock::new(|| {
+    Selector::parse("tr.even > td.more > div.ingredients-list").expect("Failed to parse selector")
+});
+
+fn scrape_category<'a>(
+    document: &'a Html,
+    tbody_selector: &Selector,
+    dish_type: DishType,
+) -> Result<impl Iterator<Item = Dish> + 'a> {
+    let tbody = document.select(tbody_selector).next().ok_or_else(|| {
+        CustomError::from(format!("No tbody found for selector: {:?}", tbody_selector))
+    })?;
+    let dishes = tbody.select(&ITEM_SELECTOR);
+    let dish_details = tbody.select(&ITEM_DETAILS_SELECTOR);
+
+    Ok(dishes
+        .zip(dish_details)
+        .filter_map(move |(dish, details)| Dish::from_element(dish, details, dish_type)))
+}
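scrape_category pairs each tr.odd description row with the tr.even details row that follows it inside the same tbody, so from_element can read prices from the former and nutrition text from the latter. A self-contained sketch of that zip over invented markup (only the selectors come from the code above):

// Illustrative only: demonstrates the 1:1 row pairing scrape_category relies on.
use scraper::{Html, Selector};

fn main() {
    let html = r#"<table><tbody>
        <tr class="odd"><td class="description"><div class="row">Dish A</div></td></tr>
        <tr class="even"><td class="more"><div class="ingredients-list">Details A</div></td></tr>
        <tr class="odd"><td class="description"><div class="row">Dish B</div></td></tr>
        <tr class="even"><td class="more"><div class="ingredients-list">Details B</div></td></tr>
    </tbody></table>"#;
    let doc = Html::parse_document(html);
    let rows = Selector::parse("tr.odd > td.description > div.row").unwrap();
    let details = Selector::parse("tr.even > td.more > div.ingredients-list").unwrap();
    // Zipping keeps each description aligned with its adjacent details row.
    for (row, detail) in doc.select(&rows).zip(doc.select(&details)) {
        println!("{} -> {}", row.inner_html(), detail.inner_html());
    }
}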
@@ -3,9 +3,8 @@ use std::env;
 use anyhow::Result;
 use chrono::NaiveDate;
 use futures::StreamExt as _;
-use num_bigint::BigInt;
 use shared::{Canteen, DishType};
-use sqlx::{postgres::PgPoolOptions, types::BigDecimal, PgPool, PgTransaction};
+use sqlx::{postgres::PgPoolOptions, PgPool, PgTransaction};
 
 use crate::{scrape_menu, Dish};
 
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut query = sqlx::QueryBuilder::new("INSERT INTO meals (date,canteen,name,dish_type,image_src,price_students,price_employees,price_guests,vegan,vegetarian) ");
|
let mut query = sqlx::QueryBuilder::new("INSERT INTO meals (date,canteen,name,dish_type,image_src,price_students,price_employees,price_guests,vegan,vegetarian,kjoules,proteins,carbohydrates,fats) ");
|
||||||
|
|
||||||
query
|
query
|
||||||
.push_values(menu, |mut sep, item| {
|
.push_values(menu, |mut sep, item| {
|
||||||
|
|
@@ -82,11 +81,15 @@ pub async fn add_menu_to_db(
                 .push_bind(item.get_name().to_string())
                 .push_bind(item.get_type() as DishType)
                 .push_bind(item.get_image_src().map(str::to_string))
-                .push_bind(price_to_bigdecimal(item.get_price_students()))
-                .push_bind(price_to_bigdecimal(item.get_price_employees()))
-                .push_bind(price_to_bigdecimal(item.get_price_guests()))
+                .push_bind(item.get_price_students().to_owned())
+                .push_bind(item.get_price_employees().to_owned())
+                .push_bind(item.get_price_guests().to_owned())
                 .push_bind(vegan)
-                .push_bind(vegan || item.is_vegetarian());
+                .push_bind(vegan || item.is_vegetarian())
+                .push_bind(item.nutrition_values.kjoule)
+                .push_bind(item.nutrition_values.protein.to_owned())
+                .push_bind(item.nutrition_values.carbs.to_owned())
+                .push_bind(item.nutrition_values.fat.to_owned());
         })
         .build()
         .execute(&mut **db)
@@ -104,8 +107,3 @@ pub async fn add_menu_to_db(
 
     Ok(())
 }
-
-pub fn price_to_bigdecimal(s: Option<&str>) -> BigDecimal {
-    s.and_then(|p| p.trim_end_matches(" €").replace(',', ".").parse().ok())
-        .unwrap_or_else(|| BigDecimal::from_bigint(BigInt::from(99999), 2))
-}