add scraper cli using clap

This commit is contained in:
Moritz Hölting 2026-01-19 10:10:38 +01:00
parent 146faa015b
commit e45daf2971
7 changed files with 228 additions and 36 deletions

121
Cargo.lock generated
View File

@ -269,6 +269,56 @@ dependencies = [
"libc", "libc",
] ]
[[package]]
name = "anstream"
version = "0.6.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a"
dependencies = [
"anstyle",
"anstyle-parse",
"anstyle-query",
"anstyle-wincon",
"colorchoice",
"is_terminal_polyfill",
"utf8parse",
]
[[package]]
name = "anstyle"
version = "1.0.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78"
[[package]]
name = "anstyle-parse"
version = "0.2.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2"
dependencies = [
"utf8parse",
]
[[package]]
name = "anstyle-query"
version = "1.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc"
dependencies = [
"windows-sys 0.61.2",
]
[[package]]
name = "anstyle-wincon"
version = "3.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d"
dependencies = [
"anstyle",
"once_cell_polyfill",
"windows-sys 0.61.2",
]
[[package]] [[package]]
name = "anyhow" name = "anyhow"
version = "1.0.100" version = "1.0.100"
@ -475,6 +525,52 @@ dependencies = [
"windows-link", "windows-link",
] ]
[[package]]
name = "clap"
version = "4.5.54"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c6e6ff9dcd79cff5cd969a17a545d79e84ab086e444102a591e288a8aa3ce394"
dependencies = [
"clap_builder",
"clap_derive",
]
[[package]]
name = "clap_builder"
version = "4.5.54"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fa42cf4d2b7a41bc8f663a7cab4031ebafa1bf3875705bfaf8466dc60ab52c00"
dependencies = [
"anstream",
"anstyle",
"clap_lex",
"strsim",
]
[[package]]
name = "clap_derive"
version = "4.5.49"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2a0b5487afeab2deb2ff4e03a807ad1a03ac532ff5a2cee5d86884440c7f7671"
dependencies = [
"heck",
"proc-macro2",
"quote",
"syn 2.0.113",
]
[[package]]
name = "clap_lex"
version = "0.7.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c3e64b0cc0439b12df2fa678eae89a1c56a529fd067a9115f7827f1fffd22b32"
[[package]]
name = "colorchoice"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75"
[[package]] [[package]]
name = "concurrent-queue" name = "concurrent-queue"
version = "2.5.0" version = "2.5.0"
@ -1443,6 +1539,12 @@ dependencies = [
"serde", "serde",
] ]
[[package]]
name = "is_terminal_polyfill"
version = "1.70.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695"
[[package]] [[package]]
name = "itertools" name = "itertools"
version = "0.14.0" version = "0.14.0"
@ -1644,6 +1746,7 @@ version = "0.2.1"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"chrono", "chrono",
"clap",
"const_format", "const_format",
"dotenvy", "dotenvy",
"futures", "futures",
@ -1765,6 +1868,12 @@ version = "1.21.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
[[package]]
name = "once_cell_polyfill"
version = "1.70.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe"
[[package]] [[package]]
name = "parking" name = "parking"
version = "2.2.1" version = "2.2.1"
@ -2885,6 +2994,12 @@ dependencies = [
"unicode-properties", "unicode-properties",
] ]
[[package]]
name = "strsim"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
[[package]] [[package]]
name = "strum" name = "strum"
version = "0.27.2" version = "0.27.2"
@ -3355,6 +3470,12 @@ version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"
[[package]]
name = "utf8parse"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
[[package]] [[package]]
name = "utoipa" name = "utoipa"
version = "5.4.0" version = "5.4.0"

View File

@ -32,7 +32,8 @@ RUN cargo chef cook --release --recipe-path recipe.json
COPY . . COPY . .
RUN OFFLINE=true cargo build --release \ RUN OFFLINE=true cargo build --release \
--bin mensa-upb-api \ --bin mensa-upb-api \
--bin mensa-upb-scraper --bin mensa-upb-scraper \
--bin scraper-cli
# ===================================================== # =====================================================
# Runtime image: scraper (cron-based) # Runtime image: scraper (cron-based)
@ -47,6 +48,7 @@ RUN echo "0 0/8 * * * /app/mensa-upb-scraper >> /var/log/cron.log 2>&1" \
touch /var/log/cron.log touch /var/log/cron.log
COPY --from=builder /app/target/release/mensa-upb-scraper /app/mensa-upb-scraper COPY --from=builder /app/target/release/mensa-upb-scraper /app/mensa-upb-scraper
COPY --from=builder /app/target/release/scraper-cli /app/scraper-cli
ENTRYPOINT ["/sbin/tini", "--"] ENTRYPOINT ["/sbin/tini", "--"]
CMD sh -c 'env > /etc/environment && crond -l 2 && tail -f /var/log/cron.log' CMD sh -c 'env > /etc/environment && crond -l 2 && tail -f /var/log/cron.log'

View File

@ -9,9 +9,14 @@ version = "0.2.1"
edition = "2024" edition = "2024"
publish = false publish = false
[[bin]]
name = "scraper-cli"
path = "src/bin/cli.rs"
[dependencies] [dependencies]
anyhow = { workspace = true } anyhow = { workspace = true }
chrono = { workspace = true } chrono = { workspace = true }
clap = { version = "4.5.54", features = ["derive", "env"] }
const_format = "0.2.33" const_format = "0.2.33"
dotenvy = { workspace = true } dotenvy = { workspace = true }
futures = { workspace = true } futures = { workspace = true }

60
scraper/src/bin/cli.rs Normal file
View File

@ -0,0 +1,60 @@
use anyhow::Result;
use clap::Parser;
use futures::future;
use mensa_upb_scraper::check_refresh;
use sqlx::postgres::PgPoolOptions;
use strum::IntoEnumIterator as _;
use tracing::level_filters::LevelFilter;
use tracing_subscriber::EnvFilter;
/// Command-line arguments for the standalone `scraper-cli` binary.
#[derive(Debug, Clone, clap::Parser)]
struct Cli {
    /// Database connection string (positional; falls back to the
    /// `DATABASE_URL` environment variable when not given).
    #[clap(env = "DATABASE_URL")]
    database: String,
    /// Canteen to scrape; may be repeated. When omitted, `main` fills
    /// this with every known canteen (see `Canteen::iter()` there).
    #[clap(short, long = "canteen")]
    canteens: Vec<shared::Canteen>,
    /// Date to scrape (YYYY-MM-DD); may be repeated, at least one is required.
    #[clap(short, long = "date", required = true)]
    dates: Vec<chrono::NaiveDate>,
    /// Force refresh even if not needed
    #[clap(short, long)]
    force: bool,
}
#[tokio::main]
async fn main() -> Result<()> {
dotenvy::dotenv().ok();
let mut cli = Cli::parse();
if cli.canteens.is_empty() {
cli.canteens = shared::Canteen::iter().collect();
}
let db = PgPoolOptions::new().connect_lazy(&cli.database)?;
let env_filter = EnvFilter::builder()
.with_default_directive(LevelFilter::WARN.into())
.from_env()
.expect("Invalid filter")
.add_directive("mensa_upb_scraper=debug".parse().unwrap());
tracing_subscriber::fmt().with_env_filter(env_filter).init();
sqlx::migrate!("../migrations").run(&db).await?;
tracing::info!("Starting up...");
let handles = cli.dates.into_iter().map(|date| {
let db = db.clone();
let canteens = cli.canteens.clone();
tokio::spawn(async move { check_refresh(&db, date, &canteens, cli.force).await })
});
future::join_all(handles).await;
tracing::info!("Finished scraping menu");
Ok(())
}

View File

@ -2,8 +2,8 @@ use std::sync::LazyLock;
use anyhow::Result; use anyhow::Result;
use chrono::{Duration, Utc}; use chrono::{Duration, Utc};
use futures::{future, StreamExt}; use futures::future;
use mensa_upb_scraper::{check_refresh, util, FILTER_CANTEENS}; use mensa_upb_scraper::{FILTER_CANTEENS, check_refresh, util};
use shared::Canteen; use shared::Canteen;
use strum::IntoEnumIterator as _; use strum::IntoEnumIterator as _;
use tracing::level_filters::LevelFilter; use tracing::level_filters::LevelFilter;
@ -36,17 +36,11 @@ async fn main() -> Result<()> {
.map(|d| (Utc::now() + Duration::days(d)).date_naive()) .map(|d| (Utc::now() + Duration::days(d)).date_naive())
.map(|date| { .map(|date| {
let db = db.clone(); let db = db.clone();
tokio::spawn(async move { check_refresh(&db, date, &CANTEENS).await }) tokio::spawn(async move { check_refresh(&db, date, &CANTEENS, false).await })
}); });
future::join_all(handles).await; future::join_all(handles).await;
futures::stream::iter((0..7).map(|d| (Utc::now() + Duration::days(d)).date_naive()))
.for_each_concurrent(None, async |date| {
check_refresh(&db, date, &CANTEENS).await;
})
.await;
tracing::info!("Finished scraping menu"); tracing::info!("Finished scraping menu");
Ok(()) Ok(())

View File

@ -27,39 +27,49 @@ static NON_FILTERED_CANTEENS: LazyLock<Vec<Canteen>> = LazyLock::new(|| {
}); });
#[tracing::instrument(skip(db))] #[tracing::instrument(skip(db))]
pub async fn check_refresh(db: &sqlx::PgPool, date: NaiveDate, canteens: &[Canteen]) -> bool { pub async fn check_refresh(
if date > Utc::now().date_naive() + chrono::Duration::days(31) { db: &sqlx::PgPool,
date: NaiveDate,
canteens: &[Canteen],
force: bool,
) -> bool {
if !force && date > Utc::now().date_naive() + chrono::Duration::days(31) {
tracing::debug!("Not refreshing menu for date {date} as it is too far in the future"); tracing::debug!("Not refreshing menu for date {date} as it is too far in the future");
return false; return false;
} }
if date < Utc::now().date_naive() { if !force && date < Utc::now().date_naive() {
tracing::trace!("Not refreshing menu for date {date} as it is in the past"); tracing::trace!("Not refreshing menu for date {date} as it is in the past");
return false; return false;
} }
let canteens_needing_refresh = match sqlx::query!( let canteens_needing_refresh = if force {
r#"SELECT canteen, max(scraped_at) AS "scraped_at!" FROM canteens_scraped WHERE canteen = ANY($1) AND scraped_for = $2 GROUP BY canteen"#, canteens.iter().cloned().collect::<BTreeSet<_>>()
&canteens } else {
.iter() match sqlx::query!(
.map(|c| c.get_identifier().to_string()) r#"SELECT canteen, max(scraped_at) AS "scraped_at!" FROM canteens_scraped WHERE canteen = ANY($1) AND scraped_for = $2 GROUP BY canteen"#,
.collect::<Vec<_>>(), &canteens
date .iter()
) .map(|c| c.get_identifier().to_string())
.fetch_all(db) .collect::<Vec<_>>(),
.await date
{ )
Ok(v) => v .fetch_all(db)
.iter() .await
.map(|r| (Canteen::from_str(&r.canteen).expect("malformed db entry"), Some(r.scraped_at))) {
.chain(NON_FILTERED_CANTEENS.iter().filter(|c| canteens.contains(c)).map(|c| (*c, None))) Ok(v) => v
.unique_by(|(c, _)| *c) .iter()
.filter(|(_, scraped_at)| scraped_at.is_none_or(|scraped_at| needs_refresh(scraped_at, date))) .map(|r| (Canteen::from_str(&r.canteen).expect("malformed db entry"), Some(r.scraped_at)))
.map(|(c, _)| c) .chain(NON_FILTERED_CANTEENS.iter().filter(|c| canteens.contains(c)).map(|c| (*c, None)))
.collect::<BTreeSet<_>>(), .unique_by(|(c, _)| *c)
Err(err) => { .filter(|(c, scraped_at)|
tracing::error!("Error checking for existing scrapes: {}", err); canteens.contains(c) && scraped_at.is_none_or(|scraped_at| needs_refresh(scraped_at, date)))
return false; .map(|(c, _)| c)
.collect::<BTreeSet<_>>(),
Err(err) => {
tracing::error!("Error checking for existing scrapes: {}", err);
return false;
}
} }
}; };

View File

@ -28,7 +28,7 @@ impl Menu {
.collect::<Vec<_>>(); .collect::<Vec<_>>();
if allow_refresh { if allow_refresh {
check_refresh(db, date, canteens).await; check_refresh(db, date, canteens, false).await;
}; };
let result = sqlx::query!(r#"SELECT name, array_agg(DISTINCT canteen ORDER BY canteen) AS "canteens!", dish_type AS "dish_type: DishType", image_src, price_students, price_employees, price_guests, vegan, vegetarian let result = sqlx::query!(r#"SELECT name, array_agg(DISTINCT canteen ORDER BY canteen) AS "canteens!", dish_type AS "dish_type: DishType", image_src, price_students, price_employees, price_guests, vegan, vegetarian