diff --git a/.sqlx/query-87707bff13b4ce6ff47d2f79ee5d40b677042a20c217acc347ecdd04ebf3e6e0.json b/.sqlx/query-87707bff13b4ce6ff47d2f79ee5d40b677042a20c217acc347ecdd04ebf3e6e0.json new file mode 100644 index 0000000..7ba5e9b --- /dev/null +++ b/.sqlx/query-87707bff13b4ce6ff47d2f79ee5d40b677042a20c217acc347ecdd04ebf3e6e0.json @@ -0,0 +1,106 @@ +{ + "db_name": "PostgreSQL", + "query": "SELECT canteen, name, image_src, price_students, price_employees, price_guests, vegetarian, vegan, dish_type AS \"dish_type: DishType\", kjoules, proteins, carbohydrates, fats FROM meals WHERE date = $1 AND is_latest = TRUE AND canteen = ANY($2)", + "describe": { + "columns": [ + { + "ordinal": 0, + "name": "canteen", + "type_info": "Text" + }, + { + "ordinal": 1, + "name": "name", + "type_info": "Text" + }, + { + "ordinal": 2, + "name": "image_src", + "type_info": "Text" + }, + { + "ordinal": 3, + "name": "price_students", + "type_info": "Numeric" + }, + { + "ordinal": 4, + "name": "price_employees", + "type_info": "Numeric" + }, + { + "ordinal": 5, + "name": "price_guests", + "type_info": "Numeric" + }, + { + "ordinal": 6, + "name": "vegetarian", + "type_info": "Bool" + }, + { + "ordinal": 7, + "name": "vegan", + "type_info": "Bool" + }, + { + "ordinal": 8, + "name": "dish_type: DishType", + "type_info": { + "Custom": { + "name": "dish_type_enum", + "kind": { + "Enum": [ + "main", + "side", + "dessert" + ] + } + } + } + }, + { + "ordinal": 9, + "name": "kjoules", + "type_info": "Int4" + }, + { + "ordinal": 10, + "name": "proteins", + "type_info": "Numeric" + }, + { + "ordinal": 11, + "name": "carbohydrates", + "type_info": "Numeric" + }, + { + "ordinal": 12, + "name": "fats", + "type_info": "Numeric" + } + ], + "parameters": { + "Left": [ + "Date", + "TextArray" + ] + }, + "nullable": [ + false, + false, + true, + false, + false, + false, + false, + false, + false, + true, + true, + true, + true + ] + }, + "hash": "87707bff13b4ce6ff47d2f79ee5d40b677042a20c217acc347ecdd04ebf3e6e0" +} 
diff --git a/scraper/src/dish.rs b/scraper/src/dish.rs index 80aa1d7..5124e10 100644 --- a/scraper/src/dish.rs +++ b/scraper/src/dish.rs @@ -1,11 +1,12 @@ use std::sync::LazyLock; -use itertools::Itertools; use num_bigint::BigInt; use scraper::{ElementRef, Selector}; use shared::DishType; use sqlx::types::BigDecimal; +use crate::util::normalize_price_bigdecimal; + static IMG_SELECTOR: LazyLock<Selector> = LazyLock::new(|| Selector::parse(".img img").expect("Failed to parse selector")); static HTML_PRICE_SELECTOR: LazyLock<Selector> = @@ -15,19 +16,20 @@ static HTML_EXTRAS_SELECTOR: LazyLock<Selector> = static HTML_NUTRITIONS_SELECTOR: LazyLock<Selector> = LazyLock::new(|| Selector::parse(".nutritions > p").expect("Failed to parse selector")); -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct Dish { - name: String, - image_src: Option<String>, - price_students: BigDecimal, - price_employees: BigDecimal, - price_guests: BigDecimal, - extras: Vec<String>, - dish_type: DishType, + pub name: String, + pub image_src: Option<String>, + pub price_students: BigDecimal, + pub price_employees: BigDecimal, + pub price_guests: BigDecimal, + pub vegetarian: bool, + pub vegan: bool, + pub dish_type: DishType, pub nutrition_values: NutritionValues, } -#[derive(Debug, Clone, Default, PartialEq, Eq)] +#[derive(Debug, Clone, Default, PartialEq, Eq, Hash)] pub struct NutritionValues { pub kjoule: Option<i32>, pub protein: Option<BigDecimal>, @@ -52,13 +54,10 @@ impl Dish { self.image_src.as_deref() } pub fn is_vegan(&self) -> bool { - self.extras.contains(&"vegan".to_string()) + self.vegan } pub fn is_vegetarian(&self) -> bool { - self.extras.contains(&"vegetarisch".to_string()) - } - pub fn get_extras(&self) -> &[String] { - &self.extras + self.vegetarian } pub fn get_type(&self) -> DishType { self.dish_type @@ -69,8 +68,9 @@ impl Dish { && self.price_employees == other.price_employees && self.price_guests == other.price_guests && self.price_students == other.price_students - && 
self.extras.iter().sorted().collect_vec() - == self.extras.iter().sorted().collect_vec() + && self.vegan == other.vegan + && self.vegetarian == other.vegetarian + && self.dish_type == other.dish_type } pub fn from_element( @@ -156,6 +156,8 @@ impl Dish { NutritionValues::default() }; + let vegan = extras.contains(&"vegan".to_string()); + Some(Self { name, image_src: img_src, @@ -171,13 +173,25 @@ impl Dish { .iter_mut() .find(|(price_for, _)| price_for == "Gäste") .map(|(_, price)| price_to_bigdecimal(Some(price)))?, - extras, + vegetarian: vegan || extras.contains(&"vegetarisch".to_string()), + vegan, dish_type, - nutrition_values, + nutrition_values: nutrition_values.normalize(), }) } } +impl NutritionValues { + pub fn normalize(self) -> Self { + Self { + kjoule: self.kjoule, + protein: self.protein.map(|p| p.with_prec(6).with_scale(2)), + carbs: self.carbs.map(|c| c.with_prec(6).with_scale(2)), + fat: self.fat.map(|f| f.with_prec(6).with_scale(2)), + } + } +} + impl PartialOrd for Dish { fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> { self.name.partial_cmp(&other.name) @@ -185,8 +199,14 @@ impl PartialOrd for Dish { } fn price_to_bigdecimal(s: Option<&str>) -> BigDecimal { - s.and_then(|p| p.trim_end_matches(" €").replace(',', ".").parse().ok()) - .unwrap_or_else(|| BigDecimal::from_bigint(BigInt::from(99999), 2)) + s.and_then(|p| { + p.trim_end_matches(" €") + .replace(',', ".") + .parse::<BigDecimal>() + .ok() + }) + .map(normalize_price_bigdecimal) + .unwrap_or_else(|| BigDecimal::from_bigint(BigInt::from(99999), 2)) } fn grams_to_bigdecimal(s: &str) -> Option<BigDecimal> { diff --git a/scraper/src/lib.rs b/scraper/src/lib.rs index 05878f5..094b2be 100644 --- a/scraper/src/lib.rs +++ b/scraper/src/lib.rs @@ -10,7 +10,7 @@ pub use dish::Dish; pub use menu::scrape_menu; pub use refresh::check_refresh; use shared::Canteen; -pub use util::scrape_canteens_at_days; +pub use util::scrape_canteens_at_days_and_insert; #[derive(Debug, Clone)] struct CustomError(String); diff --git 
a/scraper/src/main.rs b/scraper/src/main.rs index b1339be..d523ed4 100644 --- a/scraper/src/main.rs +++ b/scraper/src/main.rs @@ -46,7 +46,7 @@ async fn main() -> Result<()> { }) .collect::<Vec<_>>(); - util::scrape_canteens_at_days(&db, &date_canteen_combinations).await?; + util::scrape_canteens_at_days_and_insert(&db, &date_canteen_combinations).await?; tracing::info!("Finished scraping menu"); diff --git a/scraper/src/refresh.rs b/scraper/src/refresh.rs index 0d2ae0d..322ee53 100644 --- a/scraper/src/refresh.rs +++ b/scraper/src/refresh.rs @@ -5,11 +5,17 @@ use std::{ }; use chrono::{NaiveDate, Utc}; +use futures::{StreamExt, TryStreamExt as _}; use itertools::Itertools; -use shared::Canteen; +use shared::{Canteen, DishType}; +use sqlx::QueryBuilder; use strum::IntoEnumIterator as _; -use crate::util; +use crate::{ + dish::NutritionValues, + util::{self, add_menu_to_db, normalize_price_bigdecimal}, + Dish, +}; static NON_FILTERED_CANTEENS: LazyLock<Vec<Canteen>> = LazyLock::new(|| { let all_canteens = Canteen::iter().collect::<Vec<_>>(); @@ -61,20 +67,70 @@ pub async fn check_refresh(db: &sqlx::PgPool, date: NaiveDate, canteens: &[Cante canteens_needing_refresh ); - if let Err(err) = util::scrape_canteens_at_days( - db, + let canteen_date_pairs = canteens_needing_refresh + .iter() + .map(|c| (date, *c)) + .collect::<Vec<_>>(); + + let scraped_dishes = util::scrape_canteens_at_days(&canteen_date_pairs) + .filter_map(|res| async move { res.ok() }) + .flat_map(|(_, canteen, menu)| { + futures::stream::iter(menu).map(move |dish| (canteen, dish)) + }) + .collect::<HashSet<_>>(); + + let db_data = sqlx::query!( + r#"SELECT canteen, name, image_src, price_students, price_employees, price_guests, vegetarian, vegan, dish_type AS "dish_type: DishType", kjoules, proteins, carbohydrates, fats FROM meals WHERE date = $1 AND is_latest = TRUE AND canteen = ANY($2)"#, + date, &canteens_needing_refresh + .iter() - .map(|c| (date, *c)) + .map(|c| c.get_identifier().to_string()) .collect::<Vec<_>>(), + ).map(|r| { + ( + 
Canteen::from_str(&r.canteen).expect("malformed db entry") , + Dish { + name: r.name, + image_src: r.image_src, + price_students: normalize_price_bigdecimal(r.price_students), + price_employees: normalize_price_bigdecimal(r.price_employees), + price_guests: normalize_price_bigdecimal(r.price_guests), + vegetarian: r.vegetarian, + vegan: r.vegan, + dish_type: r.dish_type, + nutrition_values: NutritionValues { + kjoule: r.kjoules, + protein: r.proteins, + carbs: r.carbohydrates, + fat: r.fats, + }.normalize(), + } ) - .await - { - tracing::error!("Error during refresh scrape: {}", err); - return false; - } + }).fetch(db).try_collect::<HashSet<_>>(); - true + let (scraped_dishes, db_data) = futures::join!(scraped_dishes, db_data); + + match db_data { + Ok(db_dishes) => { + let stale_dishes = db_dishes + .difference(&scraped_dishes) + .collect::<HashSet<_>>(); + let new_dishes = scraped_dishes + .difference(&db_dishes) + .collect::<HashSet<_>>(); + + if let Err(err) = update_stale_dishes(db, date, &stale_dishes, &new_dishes).await { + tracing::error!("Error updating stale dishes in db: {}", err); + false + } else { + true + } + } + Err(err) => { + tracing::error!("Error fetching existing dishes from db: {}", err); + false + } + } } } @@ -89,3 +145,44 @@ fn needs_refresh(last_refreshed: chrono::DateTime<Utc>, date_entry: chrono::Naiv now.signed_duration_since(last_refreshed) >= chrono::Duration::days(2) } } + +async fn update_stale_dishes( + db: &sqlx::PgPool, + date: NaiveDate, + stale_dishes: &HashSet<&(Canteen, Dish)>, + new_dishes: &HashSet<&(Canteen, Dish)>, +) -> Result<(), sqlx::Error> { + let mut tx = db.begin().await?; + + QueryBuilder::new("UPDATE meals SET is_latest = FALSE WHERE date = ") + .push_bind(date) + .push(r#" AND ("name", canteen) IN "#) + .push_tuples(stale_dishes, |mut sep, (canteen, dish)| { + sep.push_bind(&dish.name) + .push_bind(canteen.get_identifier()); + }) + .push(";") + .build() + .execute(&mut *tx) + .await?; + + let chunks = new_dishes + .iter() + .sorted_by_key(|(c, _)| 
c) + .chunk_by(|(c, _)| c); + + let new_dishes_iter = chunks.into_iter().map(|(canteen, g)| { + ( + *canteen, + g.map(|(_, dish)| dish).cloned().collect::<Vec<_>>(), + ) + }); + + for (canteen, menu) in new_dishes_iter { + add_menu_to_db(&mut tx, &date, canteen, menu).await?; + } + + tx.commit().await?; + + Ok(()) +} diff --git a/scraper/src/util.rs b/scraper/src/util.rs index 1c12f98..19f3076 100644 --- a/scraper/src/util.rs +++ b/scraper/src/util.rs @@ -2,9 +2,9 @@ use std::env; use anyhow::Result; use chrono::NaiveDate; -use futures::StreamExt as _; +use futures::{Stream, StreamExt as _}; use shared::{Canteen, DishType}; -use sqlx::{postgres::PgPoolOptions, PgPool, PgTransaction}; +use sqlx::{postgres::PgPoolOptions, types::BigDecimal, PgPool, PgTransaction}; use crate::{scrape_menu, Dish}; @@ -13,7 +13,7 @@ pub fn get_db() -> Result<PgPool> { .connect_lazy(&env::var("DATABASE_URL").expect("missing DATABASE_URL env variable"))?) } -pub async fn scrape_canteens_at_days( +pub async fn scrape_canteens_at_days_and_insert( db: &PgPool, date_canteen_combinations: &[(NaiveDate, Canteen)], ) -> Result<()> { @@ -40,26 +40,45 @@ pub async fn scrape_canteens_at_days( transaction.commit().await }); - futures::stream::iter(date_canteen_combinations) - .then(|(date, canteen)| async move { (*date, *canteen, scrape_menu(date, *canteen).await) }) - .filter_map( - |(date, canteen, menu)| async move { menu.ok().map(|menu| (date, canteen, menu)) }, - ) - .for_each(|(date, canteen, menu)| { + let errs = scrape_canteens_at_days(date_canteen_combinations) + .then(|res| { let tx = tx.clone(); async move { - tx.send((date, canteen, menu)).await.ok(); + match res { + Ok((date, canteen, menu)) => { + tx.send((date, canteen, menu)).await.ok(); + Ok(()) + } + Err(err) => { + tracing::error!("Error scraping menu: {err}"); + Err(err) + } + } } }) + .collect::<Vec<_>>() + .await; drop(tx); - insert_handle.await??; + if let Some(err) = errs.into_iter().find_map(Result::err) { + return Err(err); + } + Ok(()) } +pub fn 
scrape_canteens_at_days<'a>( date_canteen_combinations: &'a [(NaiveDate, Canteen)], ) -> impl Stream<Item = Result<(NaiveDate, Canteen, Vec<Dish>)>> + 'a { futures::stream::iter(date_canteen_combinations).then(|(date, canteen)| async move { scrape_menu(date, *canteen) .await .map(|menu| (*date, *canteen, menu)) }) } + pub async fn add_menu_to_db( db: &mut PgTransaction<'_>, date: &NaiveDate, @@ -107,3 +126,7 @@ pub async fn add_menu_to_db( Ok(()) } + +pub fn normalize_price_bigdecimal(price: BigDecimal) -> BigDecimal { + price.with_prec(6).with_scale(2) +} diff --git a/shared/src/lib.rs b/shared/src/lib.rs index e1c7ffc..c9de741 100644 --- a/shared/src/lib.rs +++ b/shared/src/lib.rs @@ -3,7 +3,7 @@ use std::fmt::Display; mod canteen; pub use canteen::Canteen; -#[derive(Debug, Clone, Copy, PartialEq, Eq, sqlx::Type)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, sqlx::Type)] #[sqlx(type_name = "dish_type_enum")] #[sqlx(rename_all = "lowercase")] pub enum DishType {