diff --git a/Cargo.lock b/Cargo.lock
index ba914bd..3fd5a17 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -269,6 +269,56 @@ dependencies = [
  "libc",
 ]
 
+[[package]]
+name = "anstream"
+version = "0.6.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a"
+dependencies = [
+ "anstyle",
+ "anstyle-parse",
+ "anstyle-query",
+ "anstyle-wincon",
+ "colorchoice",
+ "is_terminal_polyfill",
+ "utf8parse",
+]
+
+[[package]]
+name = "anstyle"
+version = "1.0.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78"
+
+[[package]]
+name = "anstyle-parse"
+version = "0.2.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2"
+dependencies = [
+ "utf8parse",
+]
+
+[[package]]
+name = "anstyle-query"
+version = "1.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc"
+dependencies = [
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "anstyle-wincon"
+version = "3.0.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d"
+dependencies = [
+ "anstyle",
+ "once_cell_polyfill",
+ "windows-sys 0.61.2",
+]
+
 [[package]]
 name = "anyhow"
 version = "1.0.100"
@@ -475,6 +525,52 @@ dependencies = [
  "windows-link",
 ]
 
+[[package]]
+name = "clap"
+version = "4.5.54"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c6e6ff9dcd79cff5cd969a17a545d79e84ab086e444102a591e288a8aa3ce394"
+dependencies = [
+ "clap_builder",
+ "clap_derive",
+]
+
+[[package]]
+name = "clap_builder"
+version = "4.5.54"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fa42cf4d2b7a41bc8f663a7cab4031ebafa1bf3875705bfaf8466dc60ab52c00"
+dependencies = [
+ "anstream",
+ "anstyle",
+ "clap_lex",
+ "strsim",
+]
+
+[[package]]
+name = "clap_derive"
+version = "4.5.49"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2a0b5487afeab2deb2ff4e03a807ad1a03ac532ff5a2cee5d86884440c7f7671"
+dependencies = [
+ "heck",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.113",
+]
+
+[[package]]
+name = "clap_lex"
+version = "0.7.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c3e64b0cc0439b12df2fa678eae89a1c56a529fd067a9115f7827f1fffd22b32"
+
+[[package]]
+name = "colorchoice"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75"
+
 [[package]]
 name = "concurrent-queue"
 version = "2.5.0"
@@ -1443,6 +1539,12 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "is_terminal_polyfill"
+version = "1.70.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695"
+
 [[package]]
 name = "itertools"
 version = "0.14.0"
@@ -1644,6 +1746,7 @@ version = "0.2.1"
 dependencies = [
  "anyhow",
  "chrono",
+ "clap",
  "const_format",
  "dotenvy",
  "futures",
@@ -1765,6 +1868,12 @@ version = "1.21.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
 
+[[package]]
+name = "once_cell_polyfill"
+version = "1.70.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe"
+
 [[package]]
 name = "parking"
 version = "2.2.1"
@@ -2885,6 +2994,12 @@ dependencies = [
  "unicode-properties",
 ]
 
+[[package]]
+name = "strsim"
+version = "0.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
+
 [[package]]
 name = "strum"
 version = "0.27.2"
@@ -3355,6 +3470,12 @@ version = "1.0.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"
 
+[[package]]
+name = "utf8parse"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
+
 [[package]]
 name = "utoipa"
 version = "5.4.0"
diff --git a/Dockerfile b/Dockerfile
index 5e4b43d..834704a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -32,7 +32,8 @@ RUN cargo chef cook --release --recipe-path recipe.json
 COPY . .
 RUN OFFLINE=true cargo build --release \
     --bin mensa-upb-api \
-    --bin mensa-upb-scraper
+    --bin mensa-upb-scraper \
+    --bin scraper-cli
 # =====================================================
 # Runtime image: scraper (cron-based)
 # =====================================================
@@ -47,6 +48,7 @@ RUN echo "0 0/8 * * * /app/mensa-upb-scraper >> /var/log/cron.log 2>&1" \
     touch /var/log/cron.log
 
 COPY --from=builder /app/target/release/mensa-upb-scraper /app/mensa-upb-scraper
+COPY --from=builder /app/target/release/scraper-cli /app/scraper-cli
 
 ENTRYPOINT ["/sbin/tini", "--"]
 CMD sh -c 'env > /etc/environment && crond -l 2 && tail -f /var/log/cron.log'
diff --git a/scraper/Cargo.toml b/scraper/Cargo.toml
index e00b8da..df1b845 100644
--- a/scraper/Cargo.toml
+++ b/scraper/Cargo.toml
@@ -9,9 +9,14 @@ version = "0.2.1"
 edition = "2024"
 publish = false
 
+[[bin]]
+name = "scraper-cli"
+path = "src/bin/cli.rs"
+
 [dependencies]
 anyhow = { workspace = true }
 chrono = { workspace = true }
+clap = { version = "4.5.54", features = ["derive", "env"] }
 const_format = "0.2.33"
 dotenvy = { workspace = true }
 futures = { workspace = true }
diff --git a/scraper/src/bin/cli.rs b/scraper/src/bin/cli.rs
new file mode 100644
index 0000000..560a795
--- /dev/null
+++ b/scraper/src/bin/cli.rs
@@ -0,0 +1,60 @@
+use anyhow::Result;
+use clap::Parser;
+use futures::future;
+use mensa_upb_scraper::check_refresh;
+use sqlx::postgres::PgPoolOptions;
+use strum::IntoEnumIterator as _;
+use tracing::level_filters::LevelFilter;
+use tracing_subscriber::EnvFilter;
+
+#[derive(Debug, Clone, clap::Parser)]
+struct Cli {
+    /// Database connection string
+    #[clap(env = "DATABASE_URL")]
+    database: String,
+    /// Canteen to scrape
+    #[clap(short, long = "canteen")]
+    canteens: Vec<shared::Canteen>,
+    /// Date to scrape (YYYY-MM-DD)
+    #[clap(short, long = "date", required = true)]
+    dates: Vec<chrono::NaiveDate>,
+    /// Force refresh even if not needed
+    #[clap(short, long)]
+    force: bool,
+}
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    dotenvy::dotenv().ok();
+
+    let mut cli = Cli::parse();
+
+    if cli.canteens.is_empty() {
+        cli.canteens = shared::Canteen::iter().collect();
+    }
+
+    let db = PgPoolOptions::new().connect_lazy(&cli.database)?;
+
+    let env_filter = EnvFilter::builder()
+        .with_default_directive(LevelFilter::WARN.into())
+        .from_env()
+        .expect("Invalid filter")
+        .add_directive("mensa_upb_scraper=debug".parse().unwrap());
+    tracing_subscriber::fmt().with_env_filter(env_filter).init();
+
+    sqlx::migrate!("../migrations").run(&db).await?;
+
+    tracing::info!("Starting up...");
+
+    let handles = cli.dates.into_iter().map(|date| {
+        let db = db.clone();
+        let canteens = cli.canteens.clone();
+        tokio::spawn(async move { check_refresh(&db, date, &canteens, cli.force).await })
+    });
+
+    future::join_all(handles).await;
+
+    tracing::info!("Finished scraping menu");
+
+    Ok(())
+}
diff --git a/scraper/src/main.rs b/scraper/src/main.rs
index d2990fd..be1f05d 100644
--- a/scraper/src/main.rs
+++ b/scraper/src/main.rs
@@ -2,8 +2,8 @@ use std::sync::LazyLock;
 
 use anyhow::Result;
 use chrono::{Duration, Utc};
-use futures::{future, StreamExt};
-use mensa_upb_scraper::{check_refresh, util, FILTER_CANTEENS};
+use futures::future;
+use mensa_upb_scraper::{FILTER_CANTEENS, check_refresh, util};
 use shared::Canteen;
 use strum::IntoEnumIterator as _;
 use tracing::level_filters::LevelFilter;
@@ -36,17 +36,11 @@ async fn main() -> Result<()> {
         .map(|d| (Utc::now() + Duration::days(d)).date_naive())
         .map(|date| {
             let db = db.clone();
-            tokio::spawn(async move { check_refresh(&db, date, &CANTEENS).await })
+            tokio::spawn(async move { check_refresh(&db, date, &CANTEENS, false).await })
         });
 
     future::join_all(handles).await;
 
-    futures::stream::iter((0..7).map(|d| (Utc::now() + Duration::days(d)).date_naive()))
-        .for_each_concurrent(None, async |date| {
-            check_refresh(&db, date, &CANTEENS).await;
-        })
-        .await;
-
     tracing::info!("Finished scraping menu");
 
     Ok(())
diff --git a/scraper/src/refresh.rs b/scraper/src/refresh.rs
index 9556c86..1d5ac4e 100644
--- a/scraper/src/refresh.rs
+++ b/scraper/src/refresh.rs
@@ -27,39 +27,49 @@ static NON_FILTERED_CANTEENS: LazyLock<Vec<Canteen>> = LazyLock::new(|| {
 });
 
 #[tracing::instrument(skip(db))]
-pub async fn check_refresh(db: &sqlx::PgPool, date: NaiveDate, canteens: &[Canteen]) -> bool {
-    if date > Utc::now().date_naive() + chrono::Duration::days(31) {
+pub async fn check_refresh(
+    db: &sqlx::PgPool,
+    date: NaiveDate,
+    canteens: &[Canteen],
+    force: bool,
+) -> bool {
+    if !force && date > Utc::now().date_naive() + chrono::Duration::days(31) {
         tracing::debug!("Not refreshing menu for date {date} as it is too far in the future");
         return false;
     }
-    if date < Utc::now().date_naive() {
+    if !force && date < Utc::now().date_naive() {
         tracing::trace!("Not refreshing menu for date {date} as it is in the past");
         return false;
     }
 
-    let canteens_needing_refresh = match sqlx::query!(
-        r#"SELECT canteen, max(scraped_at) AS "scraped_at!" FROM canteens_scraped WHERE canteen = ANY($1) AND scraped_for = $2 GROUP BY canteen"#,
-        &canteens
-            .iter()
-            .map(|c| c.get_identifier().to_string())
-            .collect::<Vec<_>>(),
-        date
-    )
-    .fetch_all(db)
-    .await
-    {
-        Ok(v) => v
-            .iter()
-            .map(|r| (Canteen::from_str(&r.canteen).expect("malformed db entry"), Some(r.scraped_at)))
-            .chain(NON_FILTERED_CANTEENS.iter().filter(|c| canteens.contains(c)).map(|c| (*c, None)))
-            .unique_by(|(c, _)| *c)
-            .filter(|(_, scraped_at)| scraped_at.is_none_or(|scraped_at| needs_refresh(scraped_at, date)))
-            .map(|(c, _)| c)
-            .collect::<Vec<_>>(),
-        Err(err) => {
-            tracing::error!("Error checking for existing scrapes: {}", err);
-            return false;
+    let canteens_needing_refresh = if force {
+        canteens.iter().cloned().collect::<Vec<_>>()
+    } else {
+        match sqlx::query!(
+            r#"SELECT canteen, max(scraped_at) AS "scraped_at!" FROM canteens_scraped WHERE canteen = ANY($1) AND scraped_for = $2 GROUP BY canteen"#,
+            &canteens
+                .iter()
+                .map(|c| c.get_identifier().to_string())
+                .collect::<Vec<_>>(),
+            date
+        )
+        .fetch_all(db)
+        .await
+        {
+            Ok(v) => v
+                .iter()
+                .map(|r| (Canteen::from_str(&r.canteen).expect("malformed db entry"), Some(r.scraped_at)))
+                .chain(NON_FILTERED_CANTEENS.iter().filter(|c| canteens.contains(c)).map(|c| (*c, None)))
+                .unique_by(|(c, _)| *c)
+                .filter(|(c, scraped_at)|
+                    canteens.contains(c) && scraped_at.is_none_or(|scraped_at| needs_refresh(scraped_at, date)))
+                .map(|(c, _)| c)
+                .collect::<Vec<_>>(),
+            Err(err) => {
+                tracing::error!("Error checking for existing scrapes: {}", err);
+                return false;
+            }
         }
     };
diff --git a/web-api/src/menu.rs b/web-api/src/menu.rs
index ff57da2..ebf0195 100644
--- a/web-api/src/menu.rs
+++ b/web-api/src/menu.rs
@@ -28,7 +28,7 @@ impl Menu {
             .collect::<Vec<_>>();
 
         if allow_refresh {
-            check_refresh(db, date, canteens).await;
+            check_refresh(db, date, canteens, false).await;
         };
 
         let result = sqlx::query!(r#"SELECT name, array_agg(DISTINCT canteen ORDER BY canteen) AS "canteens!", dish_type AS "dish_type: DishType", image_src, price_students, price_employees, price_guests, vegan, vegetarian