add scraper cli using clap
This commit is contained in:
parent
146faa015b
commit
e45daf2971
|
|
@ -269,6 +269,56 @@ dependencies = [
|
||||||
"libc",
|
"libc",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "anstream"
|
||||||
|
version = "0.6.21"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a"
|
||||||
|
dependencies = [
|
||||||
|
"anstyle",
|
||||||
|
"anstyle-parse",
|
||||||
|
"anstyle-query",
|
||||||
|
"anstyle-wincon",
|
||||||
|
"colorchoice",
|
||||||
|
"is_terminal_polyfill",
|
||||||
|
"utf8parse",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "anstyle"
|
||||||
|
version = "1.0.13"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "anstyle-parse"
|
||||||
|
version = "0.2.7"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2"
|
||||||
|
dependencies = [
|
||||||
|
"utf8parse",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "anstyle-query"
|
||||||
|
version = "1.1.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc"
|
||||||
|
dependencies = [
|
||||||
|
"windows-sys 0.61.2",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "anstyle-wincon"
|
||||||
|
version = "3.0.11"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d"
|
||||||
|
dependencies = [
|
||||||
|
"anstyle",
|
||||||
|
"once_cell_polyfill",
|
||||||
|
"windows-sys 0.61.2",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "anyhow"
|
name = "anyhow"
|
||||||
version = "1.0.100"
|
version = "1.0.100"
|
||||||
|
|
@ -475,6 +525,52 @@ dependencies = [
|
||||||
"windows-link",
|
"windows-link",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "clap"
|
||||||
|
version = "4.5.54"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "c6e6ff9dcd79cff5cd969a17a545d79e84ab086e444102a591e288a8aa3ce394"
|
||||||
|
dependencies = [
|
||||||
|
"clap_builder",
|
||||||
|
"clap_derive",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "clap_builder"
|
||||||
|
version = "4.5.54"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "fa42cf4d2b7a41bc8f663a7cab4031ebafa1bf3875705bfaf8466dc60ab52c00"
|
||||||
|
dependencies = [
|
||||||
|
"anstream",
|
||||||
|
"anstyle",
|
||||||
|
"clap_lex",
|
||||||
|
"strsim",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "clap_derive"
|
||||||
|
version = "4.5.49"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "2a0b5487afeab2deb2ff4e03a807ad1a03ac532ff5a2cee5d86884440c7f7671"
|
||||||
|
dependencies = [
|
||||||
|
"heck",
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn 2.0.113",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "clap_lex"
|
||||||
|
version = "0.7.7"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "c3e64b0cc0439b12df2fa678eae89a1c56a529fd067a9115f7827f1fffd22b32"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "colorchoice"
|
||||||
|
version = "1.0.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "concurrent-queue"
|
name = "concurrent-queue"
|
||||||
version = "2.5.0"
|
version = "2.5.0"
|
||||||
|
|
@ -1443,6 +1539,12 @@ dependencies = [
|
||||||
"serde",
|
"serde",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "is_terminal_polyfill"
|
||||||
|
version = "1.70.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "itertools"
|
name = "itertools"
|
||||||
version = "0.14.0"
|
version = "0.14.0"
|
||||||
|
|
@ -1644,6 +1746,7 @@ version = "0.2.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"chrono",
|
"chrono",
|
||||||
|
"clap",
|
||||||
"const_format",
|
"const_format",
|
||||||
"dotenvy",
|
"dotenvy",
|
||||||
"futures",
|
"futures",
|
||||||
|
|
@ -1765,6 +1868,12 @@ version = "1.21.3"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
|
checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "once_cell_polyfill"
|
||||||
|
version = "1.70.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "parking"
|
name = "parking"
|
||||||
version = "2.2.1"
|
version = "2.2.1"
|
||||||
|
|
@ -2885,6 +2994,12 @@ dependencies = [
|
||||||
"unicode-properties",
|
"unicode-properties",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "strsim"
|
||||||
|
version = "0.11.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "strum"
|
name = "strum"
|
||||||
version = "0.27.2"
|
version = "0.27.2"
|
||||||
|
|
@ -3355,6 +3470,12 @@ version = "1.0.4"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"
|
checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "utf8parse"
|
||||||
|
version = "0.2.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "utoipa"
|
name = "utoipa"
|
||||||
version = "5.4.0"
|
version = "5.4.0"
|
||||||
|
|
|
||||||
|
|
@ -32,7 +32,8 @@ RUN cargo chef cook --release --recipe-path recipe.json
|
||||||
COPY . .
|
COPY . .
|
||||||
RUN OFFLINE=true cargo build --release \
|
RUN OFFLINE=true cargo build --release \
|
||||||
--bin mensa-upb-api \
|
--bin mensa-upb-api \
|
||||||
--bin mensa-upb-scraper
|
--bin mensa-upb-scraper \
|
||||||
|
--bin scraper-cli
|
||||||
|
|
||||||
# =====================================================
|
# =====================================================
|
||||||
# Runtime image: scraper (cron-based)
|
# Runtime image: scraper (cron-based)
|
||||||
|
|
@ -47,6 +48,7 @@ RUN echo "0 0/8 * * * /app/mensa-upb-scraper >> /var/log/cron.log 2>&1" \
|
||||||
touch /var/log/cron.log
|
touch /var/log/cron.log
|
||||||
|
|
||||||
COPY --from=builder /app/target/release/mensa-upb-scraper /app/mensa-upb-scraper
|
COPY --from=builder /app/target/release/mensa-upb-scraper /app/mensa-upb-scraper
|
||||||
|
COPY --from=builder /app/target/release/scraper-cli /app/scraper-cli
|
||||||
|
|
||||||
ENTRYPOINT ["/sbin/tini", "--"]
|
ENTRYPOINT ["/sbin/tini", "--"]
|
||||||
CMD sh -c 'env > /etc/environment && crond -l 2 && tail -f /var/log/cron.log'
|
CMD sh -c 'env > /etc/environment && crond -l 2 && tail -f /var/log/cron.log'
|
||||||
|
|
|
||||||
|
|
@ -9,9 +9,14 @@ version = "0.2.1"
|
||||||
edition = "2024"
|
edition = "2024"
|
||||||
publish = false
|
publish = false
|
||||||
|
|
||||||
|
[[bin]]
|
||||||
|
name = "scraper-cli"
|
||||||
|
path = "src/bin/cli.rs"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
anyhow = { workspace = true }
|
anyhow = { workspace = true }
|
||||||
chrono = { workspace = true }
|
chrono = { workspace = true }
|
||||||
|
clap = { version = "4.5.54", features = ["derive", "env"] }
|
||||||
const_format = "0.2.33"
|
const_format = "0.2.33"
|
||||||
dotenvy = { workspace = true }
|
dotenvy = { workspace = true }
|
||||||
futures = { workspace = true }
|
futures = { workspace = true }
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,60 @@
|
||||||
|
use anyhow::Result;
|
||||||
|
use clap::Parser;
|
||||||
|
use futures::future;
|
||||||
|
use mensa_upb_scraper::check_refresh;
|
||||||
|
use sqlx::postgres::PgPoolOptions;
|
||||||
|
use strum::IntoEnumIterator as _;
|
||||||
|
use tracing::level_filters::LevelFilter;
|
||||||
|
use tracing_subscriber::EnvFilter;
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, clap::Parser)]
|
||||||
|
struct Cli {
|
||||||
|
/// Database connection string
|
||||||
|
#[clap(env = "DATABASE_URL")]
|
||||||
|
database: String,
|
||||||
|
/// Canteen to scrape
|
||||||
|
#[clap(short, long = "canteen")]
|
||||||
|
canteens: Vec<shared::Canteen>,
|
||||||
|
/// Date to scrape (YYYY-MM-DD)
|
||||||
|
#[clap(short, long = "date", required = true)]
|
||||||
|
dates: Vec<chrono::NaiveDate>,
|
||||||
|
/// Force refresh even if not needed
|
||||||
|
#[clap(short, long)]
|
||||||
|
force: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::main]
|
||||||
|
async fn main() -> Result<()> {
|
||||||
|
dotenvy::dotenv().ok();
|
||||||
|
|
||||||
|
let mut cli = Cli::parse();
|
||||||
|
|
||||||
|
if cli.canteens.is_empty() {
|
||||||
|
cli.canteens = shared::Canteen::iter().collect();
|
||||||
|
}
|
||||||
|
|
||||||
|
let db = PgPoolOptions::new().connect_lazy(&cli.database)?;
|
||||||
|
|
||||||
|
let env_filter = EnvFilter::builder()
|
||||||
|
.with_default_directive(LevelFilter::WARN.into())
|
||||||
|
.from_env()
|
||||||
|
.expect("Invalid filter")
|
||||||
|
.add_directive("mensa_upb_scraper=debug".parse().unwrap());
|
||||||
|
tracing_subscriber::fmt().with_env_filter(env_filter).init();
|
||||||
|
|
||||||
|
sqlx::migrate!("../migrations").run(&db).await?;
|
||||||
|
|
||||||
|
tracing::info!("Starting up...");
|
||||||
|
|
||||||
|
let handles = cli.dates.into_iter().map(|date| {
|
||||||
|
let db = db.clone();
|
||||||
|
let canteens = cli.canteens.clone();
|
||||||
|
tokio::spawn(async move { check_refresh(&db, date, &canteens, cli.force).await })
|
||||||
|
});
|
||||||
|
|
||||||
|
future::join_all(handles).await;
|
||||||
|
|
||||||
|
tracing::info!("Finished scraping menu");
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
@ -2,8 +2,8 @@ use std::sync::LazyLock;
|
||||||
|
|
||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
use chrono::{Duration, Utc};
|
use chrono::{Duration, Utc};
|
||||||
use futures::{future, StreamExt};
|
use futures::future;
|
||||||
use mensa_upb_scraper::{check_refresh, util, FILTER_CANTEENS};
|
use mensa_upb_scraper::{FILTER_CANTEENS, check_refresh, util};
|
||||||
use shared::Canteen;
|
use shared::Canteen;
|
||||||
use strum::IntoEnumIterator as _;
|
use strum::IntoEnumIterator as _;
|
||||||
use tracing::level_filters::LevelFilter;
|
use tracing::level_filters::LevelFilter;
|
||||||
|
|
@ -36,17 +36,11 @@ async fn main() -> Result<()> {
|
||||||
.map(|d| (Utc::now() + Duration::days(d)).date_naive())
|
.map(|d| (Utc::now() + Duration::days(d)).date_naive())
|
||||||
.map(|date| {
|
.map(|date| {
|
||||||
let db = db.clone();
|
let db = db.clone();
|
||||||
tokio::spawn(async move { check_refresh(&db, date, &CANTEENS).await })
|
tokio::spawn(async move { check_refresh(&db, date, &CANTEENS, false).await })
|
||||||
});
|
});
|
||||||
|
|
||||||
future::join_all(handles).await;
|
future::join_all(handles).await;
|
||||||
|
|
||||||
futures::stream::iter((0..7).map(|d| (Utc::now() + Duration::days(d)).date_naive()))
|
|
||||||
.for_each_concurrent(None, async |date| {
|
|
||||||
check_refresh(&db, date, &CANTEENS).await;
|
|
||||||
})
|
|
||||||
.await;
|
|
||||||
|
|
||||||
tracing::info!("Finished scraping menu");
|
tracing::info!("Finished scraping menu");
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
|
|
|
||||||
|
|
@ -27,39 +27,49 @@ static NON_FILTERED_CANTEENS: LazyLock<Vec<Canteen>> = LazyLock::new(|| {
|
||||||
});
|
});
|
||||||
|
|
||||||
#[tracing::instrument(skip(db))]
|
#[tracing::instrument(skip(db))]
|
||||||
pub async fn check_refresh(db: &sqlx::PgPool, date: NaiveDate, canteens: &[Canteen]) -> bool {
|
pub async fn check_refresh(
|
||||||
if date > Utc::now().date_naive() + chrono::Duration::days(31) {
|
db: &sqlx::PgPool,
|
||||||
|
date: NaiveDate,
|
||||||
|
canteens: &[Canteen],
|
||||||
|
force: bool,
|
||||||
|
) -> bool {
|
||||||
|
if !force && date > Utc::now().date_naive() + chrono::Duration::days(31) {
|
||||||
tracing::debug!("Not refreshing menu for date {date} as it is too far in the future");
|
tracing::debug!("Not refreshing menu for date {date} as it is too far in the future");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if date < Utc::now().date_naive() {
|
if !force && date < Utc::now().date_naive() {
|
||||||
tracing::trace!("Not refreshing menu for date {date} as it is in the past");
|
tracing::trace!("Not refreshing menu for date {date} as it is in the past");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
let canteens_needing_refresh = match sqlx::query!(
|
let canteens_needing_refresh = if force {
|
||||||
r#"SELECT canteen, max(scraped_at) AS "scraped_at!" FROM canteens_scraped WHERE canteen = ANY($1) AND scraped_for = $2 GROUP BY canteen"#,
|
canteens.iter().cloned().collect::<BTreeSet<_>>()
|
||||||
&canteens
|
} else {
|
||||||
.iter()
|
match sqlx::query!(
|
||||||
.map(|c| c.get_identifier().to_string())
|
r#"SELECT canteen, max(scraped_at) AS "scraped_at!" FROM canteens_scraped WHERE canteen = ANY($1) AND scraped_for = $2 GROUP BY canteen"#,
|
||||||
.collect::<Vec<_>>(),
|
&canteens
|
||||||
date
|
.iter()
|
||||||
)
|
.map(|c| c.get_identifier().to_string())
|
||||||
.fetch_all(db)
|
.collect::<Vec<_>>(),
|
||||||
.await
|
date
|
||||||
{
|
)
|
||||||
Ok(v) => v
|
.fetch_all(db)
|
||||||
.iter()
|
.await
|
||||||
.map(|r| (Canteen::from_str(&r.canteen).expect("malformed db entry"), Some(r.scraped_at)))
|
{
|
||||||
.chain(NON_FILTERED_CANTEENS.iter().filter(|c| canteens.contains(c)).map(|c| (*c, None)))
|
Ok(v) => v
|
||||||
.unique_by(|(c, _)| *c)
|
.iter()
|
||||||
.filter(|(_, scraped_at)| scraped_at.is_none_or(|scraped_at| needs_refresh(scraped_at, date)))
|
.map(|r| (Canteen::from_str(&r.canteen).expect("malformed db entry"), Some(r.scraped_at)))
|
||||||
.map(|(c, _)| c)
|
.chain(NON_FILTERED_CANTEENS.iter().filter(|c| canteens.contains(c)).map(|c| (*c, None)))
|
||||||
.collect::<BTreeSet<_>>(),
|
.unique_by(|(c, _)| *c)
|
||||||
Err(err) => {
|
.filter(|(c, scraped_at)|
|
||||||
tracing::error!("Error checking for existing scrapes: {}", err);
|
canteens.contains(c) && scraped_at.is_none_or(|scraped_at| needs_refresh(scraped_at, date)))
|
||||||
return false;
|
.map(|(c, _)| c)
|
||||||
|
.collect::<BTreeSet<_>>(),
|
||||||
|
Err(err) => {
|
||||||
|
tracing::error!("Error checking for existing scrapes: {}", err);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -28,7 +28,7 @@ impl Menu {
|
||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
if allow_refresh {
|
if allow_refresh {
|
||||||
check_refresh(db, date, canteens).await;
|
check_refresh(db, date, canteens, false).await;
|
||||||
};
|
};
|
||||||
|
|
||||||
let result = sqlx::query!(r#"SELECT name, array_agg(DISTINCT canteen ORDER BY canteen) AS "canteens!", dish_type AS "dish_type: DishType", image_src, price_students, price_employees, price_guests, vegan, vegetarian
|
let result = sqlx::query!(r#"SELECT name, array_agg(DISTINCT canteen ORDER BY canteen) AS "canteens!", dish_type AS "dish_type: DishType", image_src, price_students, price_employees, price_guests, vegan, vegetarian
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue