From 83026cfcaca5eca0ad47ffb4de67422f49f0f6f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Moritz=20H=C3=B6lting?= <87192362+moritz-hoelting@users.noreply.github.com> Date: Wed, 17 Dec 2025 12:52:03 +0100 Subject: [PATCH] refresh canteens with missing entries --- scraper/Dockerfile | 3 ++- scraper/src/lib.rs | 14 +++++++++++++- scraper/src/main.rs | 17 ++++------------- scraper/src/refresh.rs | 32 ++++++++++++++++++++++++++++++-- web-api/Dockerfile | 3 ++- web-api/src/main.rs | 3 ++- 6 files changed, 53 insertions(+), 19 deletions(-) diff --git a/scraper/Dockerfile b/scraper/Dockerfile index a920265..0121504 100644 --- a/scraper/Dockerfile +++ b/scraper/Dockerfile @@ -1,5 +1,6 @@ FROM rust:latest AS chef -RUN cargo install cargo-chef +RUN curl -L --proto '=https' --tlsv1.2 -sSf https://raw.githubusercontent.com/cargo-bins/cargo-binstall/main/install-from-binstall-release.sh | bash +RUN cargo binstall cargo-chef -y WORKDIR /app FROM chef AS planner diff --git a/scraper/src/lib.rs b/scraper/src/lib.rs index ed57f22..05878f5 100644 --- a/scraper/src/lib.rs +++ b/scraper/src/lib.rs @@ -4,11 +4,12 @@ mod menu; mod refresh; pub mod util; -use std::{error::Error, fmt::Display}; +use std::{collections::HashSet, error::Error, fmt::Display, sync::LazyLock}; pub use dish::Dish; pub use menu::scrape_menu; pub use refresh::check_refresh; +use shared::Canteen; pub use util::scrape_canteens_at_days; #[derive(Debug, Clone)] @@ -33,3 +34,14 @@ impl From<String> for CustomError { CustomError(s) } } + +pub static FILTER_CANTEENS: LazyLock<HashSet<Canteen>> = LazyLock::new(|| { + std::env::var("FILTER_CANTEENS") + .ok() + .map(|s| { + s.split(',') + .filter_map(|el| el.parse::<Canteen>().ok()) + .collect::<HashSet<_>>() + }) + .unwrap_or_default() +}); diff --git a/scraper/src/main.rs b/scraper/src/main.rs index 4d5effc..b1339be 100644 --- a/scraper/src/main.rs +++ b/scraper/src/main.rs @@ -1,9 +1,9 @@ -use std::{collections::HashSet, env}; +use std::collections::HashSet; use anyhow::Result; use chrono::{Duration, Utc}; use
itertools::Itertools as _; -use mensa_upb_scraper::util; +use mensa_upb_scraper::{util, FILTER_CANTEENS}; use shared::Canteen; use strum::IntoEnumIterator as _; @@ -38,20 +38,11 @@ async fn main() -> Result<()> { }) .collect::<HashSet<_>>(); - let filter_canteens = env::var("FILTER_CANTEENS") - .ok() - .map(|s| { - s.split(',') - .filter_map(|el| el.parse::<Canteen>().ok()) - .collect::<HashSet<_>>() - }) - .unwrap_or_default(); - - let date_canteen_combinations = (0..1) + let date_canteen_combinations = (0..7) .map(|d| (Utc::now() + Duration::days(d)).date_naive()) .cartesian_product(Canteen::iter()) .filter(|entry @ (_, canteen)| { - !filter_canteens.contains(canteen) && !already_scraped.contains(entry) + !FILTER_CANTEENS.contains(canteen) && !already_scraped.contains(entry) }) .collect::<Vec<_>>(); diff --git a/scraper/src/refresh.rs b/scraper/src/refresh.rs index e9560ea..0d2ae0d 100644 --- a/scraper/src/refresh.rs +++ b/scraper/src/refresh.rs @@ -1,11 +1,32 @@ -use std::{collections::BTreeSet, str::FromStr}; +use std::{ + collections::{BTreeSet, HashSet}, + str::FromStr, + sync::LazyLock, +}; use chrono::{NaiveDate, Utc}; +use itertools::Itertools; use shared::Canteen; +use strum::IntoEnumIterator as _; use crate::util; +static NON_FILTERED_CANTEENS: LazyLock<HashSet<Canteen>> = LazyLock::new(|| { + let all_canteens = Canteen::iter().collect::<HashSet<_>>(); + + all_canteens + .difference(&super::FILTER_CANTEENS) + .cloned() + .collect::<HashSet<_>>() +}); + +#[tracing::instrument(skip(db))] pub async fn check_refresh(db: &sqlx::PgPool, date: NaiveDate, canteens: &[Canteen]) -> bool { + if date > Utc::now().date_naive() + chrono::Duration::days(7) { + tracing::debug!("Not refreshing menu for date {date} as it is too far in the future"); + return false; + } + let canteens_needing_refresh = match sqlx::query!( r#"SELECT canteen, max(scraped_at) AS "scraped_at!"
FROM canteens_scraped WHERE canteen = ANY($1) AND scraped_for = $2 GROUP BY canteen"#, &canteens @@ -17,7 +38,14 @@ pub async fn check_refresh(db: &sqlx::PgPool, date: NaiveDate, canteens: &[Cante .fetch_all(db) .await { - Ok(v) => v.iter().filter_map(|r| if needs_refresh(r.scraped_at, date) { Some(Canteen::from_str(&r.canteen).expect("malformed db canteen entry")) } else { None }).collect::<Vec<_>>(), + Ok(v) => v + .iter() + .map(|r| (Canteen::from_str(&r.canteen).expect("malformed db entry"), Some(r.scraped_at))) + .chain(NON_FILTERED_CANTEENS.iter().filter(|c| canteens.contains(c)).map(|c| (*c, None))) + .unique_by(|(c, _)| *c) + .filter(|(_, scraped_at)| scraped_at.is_none_or(|scraped_at| needs_refresh(scraped_at, date))) + .map(|(c, _)| c) + .collect::<Vec<_>>(), Err(err) => { tracing::error!("Error checking for existing scrapes: {}", err); return false; diff --git a/web-api/Dockerfile b/web-api/Dockerfile index 561d261..b5eb280 100644 --- a/web-api/Dockerfile +++ b/web-api/Dockerfile @@ -1,6 +1,7 @@ FROM rust:latest AS chef -RUN cargo install cargo-chef +RUN curl -L --proto '=https' --tlsv1.2 -sSf https://raw.githubusercontent.com/cargo-bins/cargo-binstall/main/install-from-binstall-release.sh | bash +RUN cargo binstall cargo-chef -y WORKDIR /app FROM chef AS planner diff --git a/web-api/src/main.rs b/web-api/src/main.rs index 90f761b..8c5f4c5 100644 --- a/web-api/src/main.rs +++ b/web-api/src/main.rs @@ -19,7 +19,8 @@ async fn main() -> Result<()> { .with_default_directive(LevelFilter::WARN.into()) .from_env() .expect("Invalid filter") - .add_directive("mensa_upb_api=debug".parse().unwrap()); + .add_directive("mensa_upb_api=debug".parse().unwrap()) + .add_directive("mensa_upb_scraper=debug".parse().unwrap()); tracing_subscriber::fmt().with_env_filter(env_filter).init(); match dotenvy::dotenv() {