refresh canteens with missing entries

This commit is contained in:
Moritz Hölting 2025-12-17 12:52:03 +01:00
parent 340258e461
commit 83026cfcac
6 changed files with 53 additions and 19 deletions

View File

@ -1,5 +1,6 @@
FROM rust:latest AS chef FROM rust:latest AS chef
RUN cargo install cargo-chef RUN curl -L --proto '=https' --tlsv1.2 -sSf https://raw.githubusercontent.com/cargo-bins/cargo-binstall/main/install-from-binstall-release.sh | bash
RUN cargo binstall cargo-chef -y
WORKDIR /app WORKDIR /app
FROM chef AS planner FROM chef AS planner

View File

@ -4,11 +4,12 @@ mod menu;
mod refresh; mod refresh;
pub mod util; pub mod util;
use std::{error::Error, fmt::Display}; use std::{collections::HashSet, error::Error, fmt::Display, sync::LazyLock};
pub use dish::Dish; pub use dish::Dish;
pub use menu::scrape_menu; pub use menu::scrape_menu;
pub use refresh::check_refresh; pub use refresh::check_refresh;
use shared::Canteen;
pub use util::scrape_canteens_at_days; pub use util::scrape_canteens_at_days;
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
@ -33,3 +34,14 @@ impl From<String> for CustomError {
CustomError(s) CustomError(s)
} }
} }
/// Canteens excluded via the `FILTER_CANTEENS` environment variable.
///
/// The variable is read once, on first access, and interpreted as a
/// comma-separated list of canteen identifiers. Whitespace around each entry
/// is ignored, so `a, b` and `a,b` are equivalent. Entries that do not parse
/// as a [`Canteen`] are silently dropped, and an unset (or non-Unicode)
/// variable yields an empty set.
pub static FILTER_CANTEENS: LazyLock<HashSet<Canteen>> = LazyLock::new(|| {
    std::env::var("FILTER_CANTEENS")
        .ok()
        .map(|s| {
            s.split(',')
                // Trim first: without it, the ` b` in `a, b` would fail to parse
                // and the canteen would silently stay unfiltered.
                .filter_map(|el| el.trim().parse::<Canteen>().ok())
                .collect::<HashSet<_>>()
        })
        .unwrap_or_default()
});

View File

@ -1,9 +1,9 @@
use std::{collections::HashSet, env}; use std::collections::HashSet;
use anyhow::Result; use anyhow::Result;
use chrono::{Duration, Utc}; use chrono::{Duration, Utc};
use itertools::Itertools as _; use itertools::Itertools as _;
use mensa_upb_scraper::util; use mensa_upb_scraper::{util, FILTER_CANTEENS};
use shared::Canteen; use shared::Canteen;
use strum::IntoEnumIterator as _; use strum::IntoEnumIterator as _;
@ -38,20 +38,11 @@ async fn main() -> Result<()> {
}) })
.collect::<HashSet<_>>(); .collect::<HashSet<_>>();
let filter_canteens = env::var("FILTER_CANTEENS") let date_canteen_combinations = (0..7)
.ok()
.map(|s| {
s.split(',')
.filter_map(|el| el.parse::<Canteen>().ok())
.collect::<HashSet<_>>()
})
.unwrap_or_default();
let date_canteen_combinations = (0..1)
.map(|d| (Utc::now() + Duration::days(d)).date_naive()) .map(|d| (Utc::now() + Duration::days(d)).date_naive())
.cartesian_product(Canteen::iter()) .cartesian_product(Canteen::iter())
.filter(|entry @ (_, canteen)| { .filter(|entry @ (_, canteen)| {
!filter_canteens.contains(canteen) && !already_scraped.contains(entry) !FILTER_CANTEENS.contains(canteen) && !already_scraped.contains(entry)
}) })
.collect::<Vec<_>>(); .collect::<Vec<_>>();

View File

@ -1,11 +1,32 @@
use std::{collections::BTreeSet, str::FromStr}; use std::{
collections::{BTreeSet, HashSet},
str::FromStr,
sync::LazyLock,
};
use chrono::{NaiveDate, Utc}; use chrono::{NaiveDate, Utc};
use itertools::Itertools;
use shared::Canteen; use shared::Canteen;
use strum::IntoEnumIterator as _;
use crate::util; use crate::util;
/// Every [`Canteen`] variant that is not contained in `FILTER_CANTEENS`.
///
/// Computed once on first access. The element order is unspecified because
/// it comes from iterating a hash-set difference.
static NON_FILTERED_CANTEENS: LazyLock<Vec<Canteen>> = LazyLock::new(|| {
    let every_canteen: HashSet<Canteen> = Canteen::iter().collect();
    let remaining: Vec<Canteen> = every_canteen
        .difference(&super::FILTER_CANTEENS)
        .copied()
        .collect();
    remaining
});
#[tracing::instrument(skip(db))]
pub async fn check_refresh(db: &sqlx::PgPool, date: NaiveDate, canteens: &[Canteen]) -> bool { pub async fn check_refresh(db: &sqlx::PgPool, date: NaiveDate, canteens: &[Canteen]) -> bool {
if date > Utc::now().date_naive() + chrono::Duration::days(7) {
tracing::debug!("Not refreshing menu for date {date} as it is too far in the future");
return false;
}
let canteens_needing_refresh = match sqlx::query!( let canteens_needing_refresh = match sqlx::query!(
r#"SELECT canteen, max(scraped_at) AS "scraped_at!" FROM canteens_scraped WHERE canteen = ANY($1) AND scraped_for = $2 GROUP BY canteen"#, r#"SELECT canteen, max(scraped_at) AS "scraped_at!" FROM canteens_scraped WHERE canteen = ANY($1) AND scraped_for = $2 GROUP BY canteen"#,
&canteens &canteens
@ -17,7 +38,14 @@ pub async fn check_refresh(db: &sqlx::PgPool, date: NaiveDate, canteens: &[Cante
.fetch_all(db) .fetch_all(db)
.await .await
{ {
Ok(v) => v.iter().filter_map(|r| if needs_refresh(r.scraped_at, date) { Some(Canteen::from_str(&r.canteen).expect("malformed db canteen entry")) } else { None }).collect::<BTreeSet<_>>(), Ok(v) => v
.iter()
.map(|r| (Canteen::from_str(&r.canteen).expect("malformed db entry"), Some(r.scraped_at)))
.chain(NON_FILTERED_CANTEENS.iter().filter(|c| canteens.contains(c)).map(|c| (*c, None)))
.unique_by(|(c, _)| *c)
.filter(|(_, scraped_at)| scraped_at.is_none_or(|scraped_at| needs_refresh(scraped_at, date)))
.map(|(c, _)| c)
.collect::<BTreeSet<_>>(),
Err(err) => { Err(err) => {
tracing::error!("Error checking for existing scrapes: {}", err); tracing::error!("Error checking for existing scrapes: {}", err);
return false; return false;

View File

@ -1,6 +1,7 @@
FROM rust:latest AS chef FROM rust:latest AS chef
RUN cargo install cargo-chef RUN curl -L --proto '=https' --tlsv1.2 -sSf https://raw.githubusercontent.com/cargo-bins/cargo-binstall/main/install-from-binstall-release.sh | bash
RUN cargo binstall cargo-chef -y
WORKDIR /app WORKDIR /app
FROM chef AS planner FROM chef AS planner

View File

@ -19,7 +19,8 @@ async fn main() -> Result<()> {
.with_default_directive(LevelFilter::WARN.into()) .with_default_directive(LevelFilter::WARN.into())
.from_env() .from_env()
.expect("Invalid filter") .expect("Invalid filter")
.add_directive("mensa_upb_api=debug".parse().unwrap()); .add_directive("mensa_upb_api=debug".parse().unwrap())
.add_directive("mensa_upb_scraper=debug".parse().unwrap());
tracing_subscriber::fmt().with_env_filter(env_filter).init(); tracing_subscriber::fmt().with_env_filter(env_filter).init();
match dotenvy::dotenv() { match dotenvy::dotenv() {