only update outdated db entries

This commit is contained in:
Moritz Hölting 2025-12-18 22:31:03 +01:00
parent 8a1df4946d
commit c56fe5a8a8
7 changed files with 292 additions and 46 deletions

View File

@ -0,0 +1,106 @@
{
"db_name": "PostgreSQL",
"query": "SELECT canteen, name, image_src, price_students, price_employees, price_guests, vegetarian, vegan, dish_type AS \"dish_type: DishType\", kjoules, proteins, carbohydrates, fats FROM meals WHERE date = $1 AND is_latest = TRUE AND canteen = ANY($2)",
"describe": {
"columns": [
{
"ordinal": 0,
"name": "canteen",
"type_info": "Text"
},
{
"ordinal": 1,
"name": "name",
"type_info": "Text"
},
{
"ordinal": 2,
"name": "image_src",
"type_info": "Text"
},
{
"ordinal": 3,
"name": "price_students",
"type_info": "Numeric"
},
{
"ordinal": 4,
"name": "price_employees",
"type_info": "Numeric"
},
{
"ordinal": 5,
"name": "price_guests",
"type_info": "Numeric"
},
{
"ordinal": 6,
"name": "vegetarian",
"type_info": "Bool"
},
{
"ordinal": 7,
"name": "vegan",
"type_info": "Bool"
},
{
"ordinal": 8,
"name": "dish_type: DishType",
"type_info": {
"Custom": {
"name": "dish_type_enum",
"kind": {
"Enum": [
"main",
"side",
"dessert"
]
}
}
}
},
{
"ordinal": 9,
"name": "kjoules",
"type_info": "Int4"
},
{
"ordinal": 10,
"name": "proteins",
"type_info": "Numeric"
},
{
"ordinal": 11,
"name": "carbohydrates",
"type_info": "Numeric"
},
{
"ordinal": 12,
"name": "fats",
"type_info": "Numeric"
}
],
"parameters": {
"Left": [
"Date",
"TextArray"
]
},
"nullable": [
false,
false,
true,
false,
false,
false,
false,
false,
false,
true,
true,
true,
true
]
},
"hash": "87707bff13b4ce6ff47d2f79ee5d40b677042a20c217acc347ecdd04ebf3e6e0"
}

View File

@ -1,11 +1,12 @@
use std::sync::LazyLock;
use itertools::Itertools;
use num_bigint::BigInt;
use scraper::{ElementRef, Selector};
use shared::DishType;
use sqlx::types::BigDecimal;
use crate::util::normalize_price_bigdecimal;
static IMG_SELECTOR: LazyLock<Selector> =
LazyLock::new(|| Selector::parse(".img img").expect("Failed to parse selector"));
static HTML_PRICE_SELECTOR: LazyLock<Selector> =
@ -15,19 +16,20 @@ static HTML_EXTRAS_SELECTOR: LazyLock<Selector> =
static HTML_NUTRITIONS_SELECTOR: LazyLock<Selector> =
LazyLock::new(|| Selector::parse(".nutritions > p").expect("Failed to parse selector"));
#[derive(Debug, Clone, PartialEq, Eq)]
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Dish {
name: String,
image_src: Option<String>,
price_students: BigDecimal,
price_employees: BigDecimal,
price_guests: BigDecimal,
extras: Vec<String>,
dish_type: DishType,
pub name: String,
pub image_src: Option<String>,
pub price_students: BigDecimal,
pub price_employees: BigDecimal,
pub price_guests: BigDecimal,
pub vegetarian: bool,
pub vegan: bool,
pub dish_type: DishType,
pub nutrition_values: NutritionValues,
}
#[derive(Debug, Clone, Default, PartialEq, Eq)]
#[derive(Debug, Clone, Default, PartialEq, Eq, Hash)]
pub struct NutritionValues {
pub kjoule: Option<i32>,
pub protein: Option<BigDecimal>,
@ -52,13 +54,10 @@ impl Dish {
self.image_src.as_deref()
}
pub fn is_vegan(&self) -> bool {
self.extras.contains(&"vegan".to_string())
self.vegan
}
pub fn is_vegetarian(&self) -> bool {
self.extras.contains(&"vegetarisch".to_string())
}
pub fn get_extras(&self) -> &[String] {
&self.extras
self.vegetarian
}
pub fn get_type(&self) -> DishType {
self.dish_type
@ -69,8 +68,9 @@ impl Dish {
&& self.price_employees == other.price_employees
&& self.price_guests == other.price_guests
&& self.price_students == other.price_students
&& self.extras.iter().sorted().collect_vec()
== self.extras.iter().sorted().collect_vec()
&& self.vegan == other.vegan
&& self.vegetarian == other.vegetarian
&& self.dish_type == other.dish_type
}
pub fn from_element(
@ -156,6 +156,8 @@ impl Dish {
NutritionValues::default()
};
let vegan = extras.contains(&"vegan".to_string());
Some(Self {
name,
image_src: img_src,
@ -171,13 +173,25 @@ impl Dish {
.iter_mut()
.find(|(price_for, _)| price_for == "Gäste")
.map(|(_, price)| price_to_bigdecimal(Some(price)))?,
extras,
vegetarian: vegan || extras.contains(&"vegetarisch".to_string()),
vegan,
dish_type,
nutrition_values,
nutrition_values: nutrition_values.normalize(),
})
}
}
impl NutritionValues {
pub fn normalize(self) -> Self {
Self {
kjoule: self.kjoule,
protein: self.protein.map(|p| p.with_prec(6).with_scale(2)),
carbs: self.carbs.map(|c| c.with_prec(6).with_scale(2)),
fat: self.fat.map(|f| f.with_prec(6).with_scale(2)),
}
}
}
impl PartialOrd for Dish {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
self.name.partial_cmp(&other.name)
@ -185,7 +199,13 @@ impl PartialOrd for Dish {
}
fn price_to_bigdecimal(s: Option<&str>) -> BigDecimal {
s.and_then(|p| p.trim_end_matches("").replace(',', ".").parse().ok())
s.and_then(|p| {
p.trim_end_matches("")
.replace(',', ".")
.parse::<BigDecimal>()
.ok()
})
.map(normalize_price_bigdecimal)
.unwrap_or_else(|| BigDecimal::from_bigint(BigInt::from(99999), 2))
}

View File

@ -10,7 +10,7 @@ pub use dish::Dish;
pub use menu::scrape_menu;
pub use refresh::check_refresh;
use shared::Canteen;
pub use util::scrape_canteens_at_days;
pub use util::scrape_canteens_at_days_and_insert;
#[derive(Debug, Clone)]
struct CustomError(String);

View File

@ -46,7 +46,7 @@ async fn main() -> Result<()> {
})
.collect::<Vec<_>>();
util::scrape_canteens_at_days(&db, &date_canteen_combinations).await?;
util::scrape_canteens_at_days_and_insert(&db, &date_canteen_combinations).await?;
tracing::info!("Finished scraping menu");

View File

@ -5,11 +5,17 @@ use std::{
};
use chrono::{NaiveDate, Utc};
use futures::{StreamExt, TryStreamExt as _};
use itertools::Itertools;
use shared::Canteen;
use shared::{Canteen, DishType};
use sqlx::QueryBuilder;
use strum::IntoEnumIterator as _;
use crate::util;
use crate::{
dish::NutritionValues,
util::{self, add_menu_to_db, normalize_price_bigdecimal},
Dish,
};
static NON_FILTERED_CANTEENS: LazyLock<Vec<Canteen>> = LazyLock::new(|| {
let all_canteens = Canteen::iter().collect::<HashSet<_>>();
@ -61,21 +67,71 @@ pub async fn check_refresh(db: &sqlx::PgPool, date: NaiveDate, canteens: &[Cante
canteens_needing_refresh
);
if let Err(err) = util::scrape_canteens_at_days(
db,
&canteens_needing_refresh
let canteen_date_pairs = canteens_needing_refresh
.iter()
.map(|c| (date, *c))
.collect::<Vec<_>>(),
)
.await
{
tracing::error!("Error during refresh scrape: {}", err);
return false;
}
.collect::<Vec<_>>();
let scraped_dishes = util::scrape_canteens_at_days(&canteen_date_pairs)
.filter_map(|res| async move { res.ok() })
.flat_map(|(_, canteen, menu)| {
futures::stream::iter(menu).map(move |dish| (canteen, dish))
})
.collect::<HashSet<_>>();
let db_data = sqlx::query!(
r#"SELECT canteen, name, image_src, price_students, price_employees, price_guests, vegetarian, vegan, dish_type AS "dish_type: DishType", kjoules, proteins, carbohydrates, fats FROM meals WHERE date = $1 AND is_latest = TRUE AND canteen = ANY($2)"#,
date,
&canteens_needing_refresh
.iter()
.map(|c| c.get_identifier().to_string())
.collect::<Vec<_>>(),
).map(|r| {
(
Canteen::from_str(&r.canteen).expect("malformed db entry") ,
Dish {
name: r.name,
image_src: r.image_src,
price_students: normalize_price_bigdecimal(r.price_students),
price_employees: normalize_price_bigdecimal(r.price_employees),
price_guests: normalize_price_bigdecimal(r.price_guests),
vegetarian: r.vegetarian,
vegan: r.vegan,
dish_type: r.dish_type,
nutrition_values: NutritionValues {
kjoule: r.kjoules,
protein: r.proteins,
carbs: r.carbohydrates,
fat: r.fats,
}.normalize(),
}
)
}).fetch(db).try_collect::<HashSet<_>>();
let (scraped_dishes, db_data) = futures::join!(scraped_dishes, db_data);
match db_data {
Ok(db_dishes) => {
let stale_dishes = db_dishes
.difference(&scraped_dishes)
.collect::<HashSet<_>>();
let new_dishes = scraped_dishes
.difference(&db_dishes)
.collect::<HashSet<_>>();
if let Err(err) = update_stale_dishes(db, date, &stale_dishes, &new_dishes).await {
tracing::error!("Error updating stale dishes in db: {}", err);
false
} else {
true
}
}
Err(err) => {
tracing::error!("Error fetching existing dishes from db: {}", err);
false
}
}
}
}
fn needs_refresh(last_refreshed: chrono::DateTime<Utc>, date_entry: chrono::NaiveDate) -> bool {
@ -89,3 +145,44 @@ fn needs_refresh(last_refreshed: chrono::DateTime<Utc>, date_entry: chrono::Naiv
now.signed_duration_since(last_refreshed) >= chrono::Duration::days(2)
}
}
async fn update_stale_dishes(
db: &sqlx::PgPool,
date: NaiveDate,
stale_dishes: &HashSet<&(Canteen, Dish)>,
new_dishes: &HashSet<&(Canteen, Dish)>,
) -> Result<(), sqlx::Error> {
let mut tx = db.begin().await?;
QueryBuilder::new("UPDATE meals SET is_latest = FALSE WHERE date = ")
.push_bind(date)
.push(r#" AND ("name", canteen) IN "#)
.push_tuples(stale_dishes, |mut sep, (canteen, dish)| {
sep.push_bind(&dish.name)
.push_bind(canteen.get_identifier());
})
.push(";")
.build()
.execute(&mut *tx)
.await?;
let chunks = new_dishes
.iter()
.sorted_by_key(|(c, _)| c)
.chunk_by(|(c, _)| c);
let new_dishes_iter = chunks.into_iter().map(|(canteen, g)| {
(
*canteen,
g.map(|(_, dish)| dish).cloned().collect::<Vec<_>>(),
)
});
for (canteen, menu) in new_dishes_iter {
add_menu_to_db(&mut tx, &date, canteen, menu).await?;
}
tx.commit().await?;
Ok(())
}

View File

@ -2,9 +2,9 @@ use std::env;
use anyhow::Result;
use chrono::NaiveDate;
use futures::StreamExt as _;
use futures::{Stream, StreamExt as _};
use shared::{Canteen, DishType};
use sqlx::{postgres::PgPoolOptions, PgPool, PgTransaction};
use sqlx::{postgres::PgPoolOptions, types::BigDecimal, PgPool, PgTransaction};
use crate::{scrape_menu, Dish};
@ -13,7 +13,7 @@ pub fn get_db() -> Result<PgPool> {
.connect_lazy(&env::var("DATABASE_URL").expect("missing DATABASE_URL env variable"))?)
}
pub async fn scrape_canteens_at_days(
pub async fn scrape_canteens_at_days_and_insert(
db: &PgPool,
date_canteen_combinations: &[(NaiveDate, Canteen)],
) -> Result<()> {
@ -40,26 +40,45 @@ pub async fn scrape_canteens_at_days(
transaction.commit().await
});
futures::stream::iter(date_canteen_combinations)
.then(|(date, canteen)| async move { (*date, *canteen, scrape_menu(date, *canteen).await) })
.filter_map(
|(date, canteen, menu)| async move { menu.ok().map(|menu| (date, canteen, menu)) },
)
.for_each(|(date, canteen, menu)| {
let errs = scrape_canteens_at_days(date_canteen_combinations)
.then(|res| {
let tx = tx.clone();
async move {
match res {
Ok((date, canteen, menu)) => {
tx.send((date, canteen, menu)).await.ok();
Ok(())
}
Err(err) => {
tracing::error!("Error scraping menu: {err}");
Err(err)
}
}
}
})
.collect::<Vec<_>>()
.await;
drop(tx);
insert_handle.await??;
if let Some(err) = errs.into_iter().find_map(Result::err) {
return Err(err);
}
Ok(())
}
pub fn scrape_canteens_at_days<'a>(
date_canteen_combinations: &'a [(NaiveDate, Canteen)],
) -> impl Stream<Item = Result<(NaiveDate, Canteen, Vec<Dish>)>> + 'a {
futures::stream::iter(date_canteen_combinations).then(|(date, canteen)| async move {
scrape_menu(date, *canteen)
.await
.map(|menu| (*date, *canteen, menu))
})
}
pub async fn add_menu_to_db(
db: &mut PgTransaction<'_>,
date: &NaiveDate,
@ -107,3 +126,7 @@ pub async fn add_menu_to_db(
Ok(())
}
pub fn normalize_price_bigdecimal(price: BigDecimal) -> BigDecimal {
price.with_prec(6).with_scale(2)
}

View File

@ -3,7 +3,7 @@ use std::fmt::Display;
mod canteen;
pub use canteen::Canteen;
#[derive(Debug, Clone, Copy, PartialEq, Eq, sqlx::Type)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, sqlx::Type)]
#[sqlx(type_name = "dish_type_enum")]
#[sqlx(rename_all = "lowercase")]
pub enum DishType {