add scraper cli using clap

This commit is contained in:
Moritz Hölting 2026-01-19 10:10:38 +01:00
parent 146faa015b
commit e45daf2971
7 changed files with 228 additions and 36 deletions

121
Cargo.lock generated
View File

@ -269,6 +269,56 @@ dependencies = [
"libc",
]
[[package]]
name = "anstream"
version = "0.6.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a"
dependencies = [
"anstyle",
"anstyle-parse",
"anstyle-query",
"anstyle-wincon",
"colorchoice",
"is_terminal_polyfill",
"utf8parse",
]
[[package]]
name = "anstyle"
version = "1.0.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78"
[[package]]
name = "anstyle-parse"
version = "0.2.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2"
dependencies = [
"utf8parse",
]
[[package]]
name = "anstyle-query"
version = "1.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc"
dependencies = [
"windows-sys 0.61.2",
]
[[package]]
name = "anstyle-wincon"
version = "3.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d"
dependencies = [
"anstyle",
"once_cell_polyfill",
"windows-sys 0.61.2",
]
[[package]]
name = "anyhow"
version = "1.0.100"
@ -475,6 +525,52 @@ dependencies = [
"windows-link",
]
[[package]]
name = "clap"
version = "4.5.54"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c6e6ff9dcd79cff5cd969a17a545d79e84ab086e444102a591e288a8aa3ce394"
dependencies = [
"clap_builder",
"clap_derive",
]
[[package]]
name = "clap_builder"
version = "4.5.54"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fa42cf4d2b7a41bc8f663a7cab4031ebafa1bf3875705bfaf8466dc60ab52c00"
dependencies = [
"anstream",
"anstyle",
"clap_lex",
"strsim",
]
[[package]]
name = "clap_derive"
version = "4.5.49"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2a0b5487afeab2deb2ff4e03a807ad1a03ac532ff5a2cee5d86884440c7f7671"
dependencies = [
"heck",
"proc-macro2",
"quote",
"syn 2.0.113",
]
[[package]]
name = "clap_lex"
version = "0.7.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c3e64b0cc0439b12df2fa678eae89a1c56a529fd067a9115f7827f1fffd22b32"
[[package]]
name = "colorchoice"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75"
[[package]]
name = "concurrent-queue"
version = "2.5.0"
@ -1443,6 +1539,12 @@ dependencies = [
"serde",
]
[[package]]
name = "is_terminal_polyfill"
version = "1.70.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695"
[[package]]
name = "itertools"
version = "0.14.0"
@ -1644,6 +1746,7 @@ version = "0.2.1"
dependencies = [
"anyhow",
"chrono",
"clap",
"const_format",
"dotenvy",
"futures",
@ -1765,6 +1868,12 @@ version = "1.21.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
[[package]]
name = "once_cell_polyfill"
version = "1.70.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe"
[[package]]
name = "parking"
version = "2.2.1"
@ -2885,6 +2994,12 @@ dependencies = [
"unicode-properties",
]
[[package]]
name = "strsim"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
[[package]]
name = "strum"
version = "0.27.2"
@ -3355,6 +3470,12 @@ version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"
[[package]]
name = "utf8parse"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
[[package]]
name = "utoipa"
version = "5.4.0"

View File

@ -32,7 +32,8 @@ RUN cargo chef cook --release --recipe-path recipe.json
COPY . .
RUN OFFLINE=true cargo build --release \
--bin mensa-upb-api \
--bin mensa-upb-scraper
--bin mensa-upb-scraper \
--bin scraper-cli
# =====================================================
# Runtime image: scraper (cron-based)
@ -47,6 +48,7 @@ RUN echo "0 0/8 * * * /app/mensa-upb-scraper >> /var/log/cron.log 2>&1" \
touch /var/log/cron.log
COPY --from=builder /app/target/release/mensa-upb-scraper /app/mensa-upb-scraper
COPY --from=builder /app/target/release/scraper-cli /app/scraper-cli
ENTRYPOINT ["/sbin/tini", "--"]
CMD sh -c 'env > /etc/environment && crond -l 2 && tail -f /var/log/cron.log'

View File

@ -9,9 +9,14 @@ version = "0.2.1"
edition = "2024"
publish = false
[[bin]]
name = "scraper-cli"
path = "src/bin/cli.rs"
[dependencies]
anyhow = { workspace = true }
chrono = { workspace = true }
clap = { version = "4.5.54", features = ["derive", "env"] }
const_format = "0.2.33"
dotenvy = { workspace = true }
futures = { workspace = true }

60
scraper/src/bin/cli.rs Normal file
View File

@ -0,0 +1,60 @@
use anyhow::Result;
use clap::Parser;
use futures::future;
use mensa_upb_scraper::check_refresh;
use sqlx::postgres::PgPoolOptions;
use strum::IntoEnumIterator as _;
use tracing::level_filters::LevelFilter;
use tracing_subscriber::EnvFilter;
#[derive(Debug, Clone, clap::Parser)]
struct Cli {
/// Database connection string
#[clap(env = "DATABASE_URL")]
database: String,
/// Canteen to scrape
#[clap(short, long = "canteen")]
canteens: Vec<shared::Canteen>,
/// Date to scrape (YYYY-MM-DD)
#[clap(short, long = "date", required = true)]
dates: Vec<chrono::NaiveDate>,
/// Force refresh even if not needed
#[clap(short, long)]
force: bool,
}
#[tokio::main]
async fn main() -> Result<()> {
dotenvy::dotenv().ok();
let mut cli = Cli::parse();
if cli.canteens.is_empty() {
cli.canteens = shared::Canteen::iter().collect();
}
let db = PgPoolOptions::new().connect_lazy(&cli.database)?;
let env_filter = EnvFilter::builder()
.with_default_directive(LevelFilter::WARN.into())
.from_env()
.expect("Invalid filter")
.add_directive("mensa_upb_scraper=debug".parse().unwrap());
tracing_subscriber::fmt().with_env_filter(env_filter).init();
sqlx::migrate!("../migrations").run(&db).await?;
tracing::info!("Starting up...");
let handles = cli.dates.into_iter().map(|date| {
let db = db.clone();
let canteens = cli.canteens.clone();
tokio::spawn(async move { check_refresh(&db, date, &canteens, cli.force).await })
});
future::join_all(handles).await;
tracing::info!("Finished scraping menu");
Ok(())
}

View File

@ -2,8 +2,8 @@ use std::sync::LazyLock;
use anyhow::Result;
use chrono::{Duration, Utc};
use futures::{future, StreamExt};
use mensa_upb_scraper::{check_refresh, util, FILTER_CANTEENS};
use futures::future;
use mensa_upb_scraper::{FILTER_CANTEENS, check_refresh, util};
use shared::Canteen;
use strum::IntoEnumIterator as _;
use tracing::level_filters::LevelFilter;
@ -36,17 +36,11 @@ async fn main() -> Result<()> {
.map(|d| (Utc::now() + Duration::days(d)).date_naive())
.map(|date| {
let db = db.clone();
tokio::spawn(async move { check_refresh(&db, date, &CANTEENS).await })
tokio::spawn(async move { check_refresh(&db, date, &CANTEENS, false).await })
});
future::join_all(handles).await;
futures::stream::iter((0..7).map(|d| (Utc::now() + Duration::days(d)).date_naive()))
.for_each_concurrent(None, async |date| {
check_refresh(&db, date, &CANTEENS).await;
})
.await;
tracing::info!("Finished scraping menu");
Ok(())

View File

@ -27,18 +27,26 @@ static NON_FILTERED_CANTEENS: LazyLock<Vec<Canteen>> = LazyLock::new(|| {
});
#[tracing::instrument(skip(db))]
pub async fn check_refresh(db: &sqlx::PgPool, date: NaiveDate, canteens: &[Canteen]) -> bool {
if date > Utc::now().date_naive() + chrono::Duration::days(31) {
pub async fn check_refresh(
db: &sqlx::PgPool,
date: NaiveDate,
canteens: &[Canteen],
force: bool,
) -> bool {
if !force && date > Utc::now().date_naive() + chrono::Duration::days(31) {
tracing::debug!("Not refreshing menu for date {date} as it is too far in the future");
return false;
}
if date < Utc::now().date_naive() {
if !force && date < Utc::now().date_naive() {
tracing::trace!("Not refreshing menu for date {date} as it is in the past");
return false;
}
let canteens_needing_refresh = match sqlx::query!(
let canteens_needing_refresh = if force {
canteens.iter().cloned().collect::<BTreeSet<_>>()
} else {
match sqlx::query!(
r#"SELECT canteen, max(scraped_at) AS "scraped_at!" FROM canteens_scraped WHERE canteen = ANY($1) AND scraped_for = $2 GROUP BY canteen"#,
&canteens
.iter()
@ -54,13 +62,15 @@ pub async fn check_refresh(db: &sqlx::PgPool, date: NaiveDate, canteens: &[Cante
.map(|r| (Canteen::from_str(&r.canteen).expect("malformed db entry"), Some(r.scraped_at)))
.chain(NON_FILTERED_CANTEENS.iter().filter(|c| canteens.contains(c)).map(|c| (*c, None)))
.unique_by(|(c, _)| *c)
.filter(|(_, scraped_at)| scraped_at.is_none_or(|scraped_at| needs_refresh(scraped_at, date)))
.filter(|(c, scraped_at)|
canteens.contains(c) && scraped_at.is_none_or(|scraped_at| needs_refresh(scraped_at, date)))
.map(|(c, _)| c)
.collect::<BTreeSet<_>>(),
Err(err) => {
tracing::error!("Error checking for existing scrapes: {}", err);
return false;
}
}
};
if canteens_needing_refresh.is_empty() {

View File

@ -28,7 +28,7 @@ impl Menu {
.collect::<Vec<_>>();
if allow_refresh {
check_refresh(db, date, canteens).await;
check_refresh(db, date, canteens, false).await;
};
let result = sqlx::query!(r#"SELECT name, array_agg(DISTINCT canteen ORDER BY canteen) AS "canteens!", dish_type AS "dish_type: DishType", image_src, price_students, price_employees, price_guests, vegan, vegetarian