Compare commits

...

2 Commits

Author SHA1 Message Date
Moritz Hölting 8e3dd731c5 scrape nutritional values 2025-12-15 22:51:39 +01:00
Moritz Hölting 4729c1afab web api automatically refreshes outdated 2025-12-15 21:39:27 +01:00
29 changed files with 1196 additions and 799 deletions


@@ -0,0 +1,29 @@
{
"db_name": "PostgreSQL",
"query": "SELECT canteen, max(scraped_at) AS \"scraped_at!\" FROM canteens_scraped WHERE canteen = ANY($1) AND scraped_for = $2 GROUP BY canteen",
"describe": {
"columns": [
{
"ordinal": 0,
"name": "canteen",
"type_info": "Text"
},
{
"ordinal": 1,
"name": "scraped_at!",
"type_info": "Timestamptz"
}
],
"parameters": {
"Left": [
"TextArray",
"Date"
]
},
"nullable": [
false,
null
]
},
"hash": "2306ceee73b304c3ca88da52837ee4173631a63d3a89e6440b3334c546213863"
}
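
Note: these .sqlx/*.json files are the offline query metadata written by cargo sqlx prepare, so the sqlx::query! macros can type-check without a live database. The ! suffix in the scraped_at! alias is sqlx's non-null override: Postgres reports aggregates such as max() as nullable (hence the null in the "nullable" array), and the override makes the macro decode the column as non-optional. A sketch of the invocation this metadata belongs to (the real call site is in scraper/src/refresh.rs below; canteen_ids and date are illustrative bindings):

    // `canteen_ids` (Vec<String>) and `date` (chrono::NaiveDate) are illustrative.
    let rows = sqlx::query!(
        r#"SELECT canteen, max(scraped_at) AS "scraped_at!" FROM canteens_scraped
           WHERE canteen = ANY($1) AND scraped_for = $2 GROUP BY canteen"#,
        &canteen_ids,
        date
    )
    .fetch_all(&db)
    .await?;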


@@ -0,0 +1,15 @@
{
"db_name": "PostgreSQL",
"query": "INSERT INTO canteens_scraped (scraped_for, canteen) VALUES ($1, $2)",
"describe": {
"columns": [],
"parameters": {
"Left": [
"Date",
"Text"
]
},
"nullable": []
},
"hash": "474de9870fb2cbfb2cdc37004c82f42b80a311d4a00ee22b97dd1e7b5c91ad39"
}


@@ -1,23 +0,0 @@
{
"db_name": "PostgreSQL",
"query": "INSERT INTO meals (date,canteen,name,dish_type,image_src,price_students,price_employees,price_guests,vegan,vegetarian)\n VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10)\n ON CONFLICT (date,canteen,name) DO NOTHING",
"describe": {
"columns": [],
"parameters": {
"Left": [
"Date",
"Text",
"Text",
"Text",
"Text",
"Numeric",
"Numeric",
"Numeric",
"Bool",
"Bool"
]
},
"nullable": []
},
"hash": "4fdb615a3e155d8394c70f25d2d8946bed129746b70f92f66704f02093b2e27c"
}


@@ -1,11 +1,11 @@
{
"db_name": "PostgreSQL",
"query": "SELECT DISTINCT date, canteen FROM MEALS WHERE date >= $1 AND date <= $2",
"query": "SELECT DISTINCT scraped_for, canteen FROM canteens_scraped WHERE scraped_for >= $1 AND scraped_for <= $2",
"describe": {
"columns": [
{
"ordinal": 0,
"name": "date",
"name": "scraped_for",
"type_info": "Date"
},
{
@@ -25,5 +25,5 @@
false
]
},
"hash": "b94a6b49fb5e53e361da7a890dd5f62d467293454b01175939e32339ee90fd23"
"hash": "65858112433addbff921108a5b110ffead845478d359af83b70d98ff8d1945f2"
}


@@ -0,0 +1,15 @@
{
"db_name": "PostgreSQL",
"query": "UPDATE meals SET is_latest = FALSE WHERE date = $1 AND canteen = $2 AND is_latest = TRUE",
"describe": {
"columns": [],
"parameters": {
"Left": [
"Date",
"Text"
]
},
"nullable": []
},
"hash": "f804f9c634a34945d7aa0cd3162b20ff9f1ff928912d871a708a088f2d011ba7"
}
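
Note: this UPDATE is the first step of the row-versioning scheme introduced by the migration below. Instead of upserting, the scraper retires the rows currently marked as latest for a (date, canteen) pair, inserts the freshly scraped meals, and logs the scrape in canteens_scraped. A minimal sketch of the sequence inside one transaction (tx and canteen_id are illustrative names):

    // 1. retire the currently visible rows for this date/canteen
    sqlx::query!(
        "UPDATE meals SET is_latest = FALSE WHERE date = $1 AND canteen = $2 AND is_latest = TRUE",
        date,
        canteen_id
    )
    .execute(&mut *tx)
    .await?;
    // 2. insert the freshly scraped meals (is_latest defaults to TRUE)
    // 3. INSERT INTO canteens_scraped (scraped_for, canteen) to record the scrape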


@@ -1,6 +1,6 @@
{
"db_name": "PostgreSQL",
"query": "SELECT name, array_agg(DISTINCT canteen ORDER BY canteen) AS canteens, dish_type, image_src, price_students, price_employees, price_guests, vegan, vegetarian \n FROM meals WHERE date = $1 AND canteen = ANY($2) \n GROUP BY name, dish_type, image_src, price_students, price_employees, price_guests, vegan, vegetarian\n ORDER BY name",
"query": "SELECT name, array_agg(DISTINCT canteen ORDER BY canteen) AS \"canteens!\", dish_type AS \"dish_type: DishType\", image_src, price_students, price_employees, price_guests, vegan, vegetarian \n FROM meals WHERE date = $1 AND canteen = ANY($2) AND is_latest = TRUE\n GROUP BY name, dish_type, image_src, price_students, price_employees, price_guests, vegan, vegetarian\n ORDER BY name",
"describe": {
"columns": [
{
@@ -10,13 +10,24 @@
},
{
"ordinal": 1,
"name": "canteens",
"name": "canteens!",
"type_info": "TextArray"
},
{
"ordinal": 2,
"name": "dish_type",
"type_info": "Text"
"name": "dish_type: DishType",
"type_info": {
"Custom": {
"name": "dish_type_enum",
"kind": {
"Enum": [
"main",
"side",
"dessert"
]
}
}
}
},
{
"ordinal": 3,
@@ -67,5 +78,5 @@
false
]
},
"hash": "b5a990f34095b255672e81562dc905e1957d1d33d823dc82ec92b552f5092028"
"hash": "ffbe520bbd10d79f189bc4cb202fc4367d1a1ea563d1b7845ab099ef6ec1e47a"
}

Cargo.lock (generated): diff suppressed because it is too large.


@@ -3,6 +3,7 @@
[workspace]
members = [
"scraper",
"shared",
"web-api",
]
resolver = "2"
@@ -14,12 +15,14 @@ repository = "https://github.com/moritz-hoelting/mensa-upb-api"
readme = "README.md"
[workspace.dependencies]
anyhow = "1.0.93"
chrono = "0.4.38"
anyhow = "1.0.100"
chrono = "0.4.42"
dotenvy = "0.15.7"
futures = "0.3.31"
itertools = "0.14.0"
serde = { version = "1.0.228", features = ["derive"] }
sqlx = "0.8.2"
strum = "0.27.1"
tokio = "1.46.0"
tracing = "0.1.40"
tracing-subscriber = "0.3.18"
strum = "0.27.2"
tokio = "1.48.0"
tracing = "0.1.43"
tracing-subscriber = "0.3.22"


@@ -0,0 +1,33 @@
-- Add down migration script here
DROP VIEW IF EXISTS meals_view;
DROP INDEX IF EXISTS idx_meals_date_canteen_latest;
DROP INDEX IF EXISTS idx_meals_refreshed_at;
DELETE FROM meals WHERE is_latest = FALSE;
ALTER TABLE meals
DROP CONSTRAINT meals_pkey;
ALTER TABLE meals
DROP COLUMN id;
ALTER TABLE meals
ADD CONSTRAINT meals_pkey PRIMARY KEY (date, canteen, name);
ALTER TABLE meals
DROP COLUMN is_latest;
ALTER TABLE meals
DROP COLUMN refreshed_at;
ALTER TABLE meals
ALTER COLUMN dish_type
TYPE TEXT
USING dish_type::TEXT;
DROP TABLE IF EXISTS canteens_scraped;
DROP TYPE IF EXISTS dish_type_enum;


@@ -0,0 +1,51 @@
-- Add up migration script here
CREATE TABLE canteens_scraped (
canteen TEXT NOT NULL,
scraped_for DATE NOT NULL,
scraped_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
PRIMARY KEY (canteen, scraped_for, scraped_at)
);
ALTER TABLE meals
ADD COLUMN id UUID NOT NULL DEFAULT gen_random_uuid();
-- Remove existing primary key constraints
DO $$
DECLARE
r RECORD;
BEGIN
FOR r IN
SELECT conname
FROM pg_constraint
WHERE contype = 'p'
AND conrelid = 'meals'::regclass
LOOP
EXECUTE format('ALTER TABLE meals DROP CONSTRAINT %I', r.conname);
END LOOP;
END $$;
ALTER TABLE meals
ADD CONSTRAINT meals_pkey PRIMARY KEY (id);
ALTER TABLE meals
ADD COLUMN is_latest BOOLEAN NOT NULL DEFAULT TRUE;
ALTER TABLE meals
ADD COLUMN refreshed_at TIMESTAMPTZ NOT NULL DEFAULT NOW();
CREATE TYPE dish_type_enum AS ENUM ('main', 'side', 'dessert');
ALTER TABLE meals
ALTER COLUMN dish_type
TYPE dish_type_enum
USING dish_type::dish_type_enum;
CREATE INDEX idx_meals_date_canteen_latest ON meals(date, canteen, is_latest);
CREATE INDEX idx_meals_refreshed_at ON meals(refreshed_at);
CREATE VIEW meals_view AS
SELECT id, date, canteen, name, dish_type, image_src, price_students, price_employees, price_guests, vegan, vegetarian
FROM meals
WHERE is_latest = TRUE;
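
Note: the anonymous DO block drops the primary key by looking its name up in pg_constraint instead of assuming it is literally called meals_pkey, since autogenerated constraint names can differ between databases. meals_view preserves the old one-row-per-current-meal shape for readers; a hedged sketch of consuming it from Rust (date and db are illustrative bindings):

    // Read through the view instead of filtering meals by is_latest manually.
    let rows = sqlx::query("SELECT name, canteen, dish_type::TEXT FROM meals_view WHERE date = $1")
        .bind(date)
        .fetch_all(&db)
        .await?;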


@@ -0,0 +1,15 @@
-- Add down migration script here
DROP VIEW IF EXISTS meals_view;
ALTER TABLE meals
DROP COLUMN kjoules;
ALTER TABLE meals
DROP COLUMN proteins;
ALTER TABLE meals
DROP COLUMN carbohydrates;
ALTER TABLE meals
DROP COLUMN fats;


@@ -0,0 +1,34 @@
-- Add up migration script here
ALTER TABLE meals
ADD COLUMN kjoules INT;
ALTER TABLE meals
ADD COLUMN proteins NUMERIC(6,2);
ALTER TABLE meals
ADD COLUMN carbohydrates NUMERIC(6,2);
ALTER TABLE meals
ADD COLUMN fats NUMERIC(6,2);
CREATE OR REPLACE VIEW meals_view AS
SELECT
id,
date,
canteen,
name,
dish_type,
image_src,
price_students,
price_employees,
price_guests,
vegan,
vegetarian,
kjoules,
proteins,
carbohydrates,
fats,
round(kjoules / 4.184) AS kcal
FROM meals
WHERE is_latest = TRUE;
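
Note: the view derives kcal from the stored kjoules on the fly. Since 1 kcal = 4.184 kJ, a 2000 kJ dish comes out as round(2000 / 4.184) = 478 kcal. The same conversion in Rust, for illustration only:

    // Mirror of the view's derived column (illustrative helper, not part of the diff).
    fn kjoules_to_kcal(kjoules: i32) -> i64 {
        (f64::from(kjoules) / 4.184).round() as i64
    }

    assert_eq!(kjoules_to_kcal(2000), 478);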


@@ -14,11 +14,12 @@ anyhow = { workspace = true }
chrono = { workspace = true }
const_format = "0.2.33"
dotenvy = { workspace = true }
futures = "0.3.31"
futures = { workspace = true }
itertools = { workspace = true }
num-bigint = "0.4.6"
reqwest = { version = "0.12.9", default-features = false, features = ["charset", "rustls-tls", "http2"] }
scraper = "0.23.1"
scraper = "0.25.0"
shared = { path = "../shared" }
sqlx = { workspace = true, features = ["runtime-tokio-rustls", "postgres", "migrate", "chrono", "uuid", "bigdecimal"] }
strum = { workspace = true, features = ["derive"] }
tokio = { workspace = true, features = ["macros", "rt-multi-thread"] }


@@ -1,24 +1,14 @@
use std::str::FromStr;
use const_format::concatcp;
use strum::EnumIter;
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, EnumIter, Hash)]
pub enum Canteen {
Forum,
Academica,
Picknick,
BonaVista,
GrillCafe,
ZM2,
Basilica,
Atrium,
}
use shared::Canteen;
const POST_URL_BASE: &str = "https://www.studierendenwerk-pb.de/gastronomie/speiseplaene/";
impl Canteen {
pub fn get_url(&self) -> &str {
pub trait CanteenExt {
fn get_url(&self) -> &str;
}
impl CanteenExt for Canteen {
fn get_url(&self) -> &str {
match self {
Self::Forum => concatcp!(POST_URL_BASE, "forum/"),
Self::Academica => concatcp!(POST_URL_BASE, "mensa-academica/"),
@@ -30,35 +20,4 @@ impl Canteen {
Self::Atrium => concatcp!(POST_URL_BASE, "mensa-atrium-lippstadt/"),
}
}
pub fn get_identifier(&self) -> &str {
match self {
Self::Forum => "forum",
Self::Academica => "academica",
Self::Picknick => "picknick",
Self::BonaVista => "bona-vista",
Self::GrillCafe => "grillcafe",
Self::ZM2 => "zm2",
Self::Basilica => "basilica",
Self::Atrium => "atrium",
}
}
}
impl FromStr for Canteen {
type Err = String;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"forum" => Ok(Self::Forum),
"academica" => Ok(Self::Academica),
"picknick" => Ok(Self::Picknick),
"bona-vista" => Ok(Self::BonaVista),
"grillcafe" => Ok(Self::GrillCafe),
"zm2" => Ok(Self::ZM2),
"basilica" => Ok(Self::Basilica),
"atrium" => Ok(Self::Atrium),
invalid => Err(format!("Invalid canteen identifier: {}", invalid)),
}
}
}
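
Note: with Canteen moved into the new shared crate, the scraper can no longer add inherent methods to it (Rust forbids inherent impls on foreign types), so the URL logic moves into the CanteenExt extension trait. Call sites inside the scraper crate stay almost unchanged; the trait just has to be in scope:

    use shared::Canteen;
    use crate::canteen::CanteenExt as _; // importing the trait anonymously is enough

    let url = Canteen::Forum.get_url(); // now resolves via the extension trait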


@@ -1,31 +1,52 @@
use std::fmt::Display;
use std::sync::LazyLock;
use itertools::Itertools;
use scraper::ElementRef;
use num_bigint::BigInt;
use scraper::{ElementRef, Selector};
use shared::DishType;
use sqlx::types::BigDecimal;
static IMG_SELECTOR: LazyLock<Selector> =
LazyLock::new(|| Selector::parse(".img img").expect("Failed to parse selector"));
static HTML_PRICE_SELECTOR: LazyLock<Selector> =
LazyLock::new(|| Selector::parse(".desc .price").expect("Failed to parse selector"));
static HTML_EXTRAS_SELECTOR: LazyLock<Selector> =
LazyLock::new(|| Selector::parse(".desc .buttons > *").expect("Failed to parse selector"));
static HTML_NUTRITIONS_SELECTOR: LazyLock<Selector> =
LazyLock::new(|| Selector::parse(".nutritions > p").expect("Failed to parse selector"));
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Dish {
name: String,
image_src: Option<String>,
price_students: Option<String>,
price_employees: Option<String>,
price_guests: Option<String>,
price_students: BigDecimal,
price_employees: BigDecimal,
price_guests: BigDecimal,
extras: Vec<String>,
dish_type: DishType,
pub nutrition_values: NutritionValues,
}
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct NutritionValues {
pub kjoule: Option<i64>,
pub protein: Option<BigDecimal>,
pub carbs: Option<BigDecimal>,
pub fat: Option<BigDecimal>,
}
impl Dish {
pub fn get_name(&self) -> &str {
&self.name
}
pub fn get_price_students(&self) -> Option<&str> {
self.price_students.as_deref()
pub fn get_price_students(&self) -> &BigDecimal {
&self.price_students
}
pub fn get_price_employees(&self) -> Option<&str> {
self.price_employees.as_deref()
pub fn get_price_employees(&self) -> &BigDecimal {
&self.price_employees
}
pub fn get_price_guests(&self) -> Option<&str> {
self.price_guests.as_deref()
pub fn get_price_guests(&self) -> &BigDecimal {
&self.price_guests
}
pub fn get_image_src(&self) -> Option<&str> {
self.image_src.as_deref()
@@ -52,8 +73,12 @@ impl Dish {
== self.extras.iter().sorted().collect_vec()
}
pub fn from_element(element: ElementRef, dish_type: DishType) -> Option<Self> {
let html_name_selector = scraper::Selector::parse(".desc h4").ok()?;
pub fn from_element(
element: ElementRef,
details: ElementRef,
dish_type: DishType,
) -> Option<Self> {
let html_name_selector = Selector::parse(".desc h4").ok()?;
let name = element
.select(&html_name_selector)
.next()?
@@ -63,16 +88,14 @@
.trim()
.to_string();
let img_selector = scraper::Selector::parse(".img img").ok()?;
let img_src = element.select(&img_selector).next().and_then(|el| {
let img_src = element.select(&IMG_SELECTOR).next().and_then(|el| {
el.value()
.attr("src")
.map(|img_src_path| format!("https://www.studierendenwerk-pb.de/{}", img_src_path))
});
let html_price_selector = scraper::Selector::parse(".desc .price").ok()?;
let mut prices = element
.select(&html_price_selector)
.select(&HTML_PRICE_SELECTOR)
.filter_map(|price| {
let price_for = price.first_child().and_then(|strong| {
strong.first_child().and_then(|text_element| {
@@ -93,29 +116,64 @@
})
.collect::<Vec<_>>();
let html_extras_selector = scraper::Selector::parse(".desc .buttons > *").ok()?;
let extras = element
.select(&html_extras_selector)
.select(&HTML_EXTRAS_SELECTOR)
.filter_map(|extra| extra.value().attr("title").map(|title| title.to_string()))
.collect::<Vec<_>>();
let nutritions_element = details.select(&HTML_NUTRITIONS_SELECTOR).next();
let nutrition_values = if let Some(nutritions_element) = nutritions_element {
let mut kjoule = None;
let mut protein = None;
let mut carbs = None;
let mut fat = None;
for s in nutritions_element.text() {
let s = s.trim();
if !s.is_empty() {
if let Some(rest) = s.strip_prefix("Brennwert = ") {
kjoule = rest
.split_whitespace()
.next()
.and_then(|num_str| num_str.parse().ok());
} else if let Some(rest) = s.strip_prefix("Eiweiß = ") {
protein = grams_to_bigdecimal(rest);
} else if let Some(rest) = s.strip_prefix("Kohlenhydrate = ") {
carbs = grams_to_bigdecimal(rest);
} else if let Some(rest) = s.strip_prefix("Fett = ") {
fat = grams_to_bigdecimal(rest);
}
}
}
NutritionValues {
kjoule,
protein,
carbs,
fat,
}
} else {
NutritionValues::default()
};
Some(Self {
name,
image_src: img_src,
price_students: prices
.iter_mut()
.find(|(price_for, _)| price_for == "Studierende")
.map(|(_, price)| std::mem::take(price)),
.map(|(_, price)| price_to_bigdecimal(Some(price.as_str())))?,
price_employees: prices
.iter_mut()
.find(|(price_for, _)| price_for == "Bedienstete")
.map(|(_, price)| std::mem::take(price)),
.map(|(_, price)| price_to_bigdecimal(Some(price.as_str())))?,
price_guests: prices
.iter_mut()
.find(|(price_for, _)| price_for == "Gäste")
.map(|(_, price)| std::mem::take(price)),
.map(|(_, price)| price_to_bigdecimal(Some(price.as_str())))?,
extras,
dish_type,
nutrition_values,
})
}
}
@ -126,20 +184,15 @@ impl PartialOrd for Dish {
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DishType {
Main,
Side,
Dessert,
fn price_to_bigdecimal(s: Option<&str>) -> BigDecimal {
s.and_then(|p| p.trim_end_matches(" €").replace(',', ".").parse().ok())
.unwrap_or_else(|| BigDecimal::from_bigint(BigInt::from(99999), 2))
}
impl Display for DishType {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let s = match self {
Self::Main => "main",
Self::Side => "side",
Self::Dessert => "dessert",
};
f.write_str(s)
}
fn grams_to_bigdecimal(s: &str) -> Option<BigDecimal> {
s.trim_end_matches("g")
.replace(',', ".")
.trim()
.parse()
.ok()
}
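
Note: both helpers normalize the German decimal comma before parsing. Assuming the site renders prices like "1,50 €" and nutrition values like "12,3 g" (illustrative strings), they behave as sketched below; a missing or unparsable price falls back to the 999.99 sentinel (BigInt 99999 at scale 2) instead of failing:

    // price_to_bigdecimal(Some("1,50 €")) -> 1.50
    // price_to_bigdecimal(None)           -> 999.99 (sentinel)
    // grams_to_bigdecimal("12,3 g")       -> Some(12.3)
    // grams_to_bigdecimal("n/a")          -> None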


@@ -1,12 +1,15 @@
mod canteen;
mod dish;
mod menu;
mod refresh;
pub mod util;
use std::{error::Error, fmt::Display};
pub use canteen::Canteen;
pub use dish::Dish;
pub use menu::scrape_menu;
pub use refresh::check_refresh;
pub use util::scrape_canteens_at_days;
#[derive(Debug, Clone)]
struct CustomError(String);


@@ -3,8 +3,9 @@ use std::{collections::HashSet, env};
use anyhow::Result;
use chrono::{Duration, Utc};
use itertools::Itertools as _;
use mensa_upb_scraper::{util, Canteen};
use strum::IntoEnumIterator;
use mensa_upb_scraper::util;
use shared::Canteen;
use strum::IntoEnumIterator as _;
#[tokio::main]
async fn main() -> Result<()> {
@@ -22,7 +23,7 @@
let end_date = (Utc::now() + Duration::days(6)).date_naive();
let already_scraped = sqlx::query!(
"SELECT DISTINCT date, canteen FROM MEALS WHERE date >= $1 AND date <= $2",
"SELECT DISTINCT scraped_for, canteen FROM canteens_scraped WHERE scraped_for >= $1 AND scraped_for <= $2",
start_date,
end_date
)
@@ -31,7 +32,7 @@
.into_iter()
.map(|r| {
(
r.date,
r.scraped_for,
r.canteen.parse::<Canteen>().expect("Invalid db entry"),
)
})
@@ -46,18 +47,15 @@
})
.unwrap_or_default();
let date_canteen_combinations = (0..7)
let date_canteen_combinations = (0..1)
.map(|d| (Utc::now() + Duration::days(d)).date_naive())
.cartesian_product(Canteen::iter())
.filter(|entry| !filter_canteens.contains(&entry.1) && !already_scraped.contains(entry))
.filter(|entry @ (_, canteen)| {
!filter_canteens.contains(canteen) && !already_scraped.contains(entry)
})
.collect::<Vec<_>>();
util::async_for_each(&date_canteen_combinations, |(date, canteen, menu)| {
let db = db.clone();
async move {
util::add_menu_to_db(&db, &date, canteen, menu).await;
}
})
.await;
util::scrape_canteens_at_days(&db, &date_canteen_combinations).await?;
tracing::info!("Finished scraping menu");


@@ -1,7 +1,21 @@
use std::sync::LazyLock;
use anyhow::Result;
use chrono::NaiveDate;
use scraper::{Html, Selector};
use shared::{Canteen, DishType};
use crate::{dish::DishType, Canteen, CustomError, Dish};
use crate::{canteen::CanteenExt as _, CustomError, Dish};
static HTML_MAIN_DISHES_TBODY_SELECTOR: LazyLock<Selector> = LazyLock::new(|| {
Selector::parse("table.table-dishes.main-dishes > tbody").expect("Failed to parse selector")
});
static HTML_SIDE_DISHES_TBODY_SELECTOR: LazyLock<Selector> = LazyLock::new(|| {
Selector::parse("table.table-dishes.side-dishes > tbody").expect("Failed to parse selector")
});
static HTML_DESSERTS_TBODY_SELECTOR: LazyLock<Selector> = LazyLock::new(|| {
Selector::parse("table.table-dishes.soups > tbody").expect("Failed to parse selector")
});
#[tracing::instrument]
pub async fn scrape_menu(date: &NaiveDate, canteen: Canteen) -> Result<Vec<Dish>> {
@@ -18,39 +32,41 @@ pub async fn scrape_menu(date: &NaiveDate, canteen: Canteen) -> Result<Vec<Dish>
let document = scraper::Html::parse_document(&html_content);
let html_main_dishes_selector = scraper::Selector::parse(
"table.table-dishes.main-dishes > tbody > tr.odd > td.description > div.row",
)
.map_err(|_| CustomError::from("Failed to parse selector"))?;
let html_main_dishes = document.select(&html_main_dishes_selector);
let main_dishes = html_main_dishes
.filter_map(|dish| Dish::from_element(dish, DishType::Main))
.collect::<Vec<_>>();
let html_side_dishes_selector = scraper::Selector::parse(
"table.table-dishes.side-dishes > tbody > tr.odd > td.description > div.row",
)
.map_err(|_| CustomError::from("Failed to parse selector"))?;
let html_side_dishes = document.select(&html_side_dishes_selector);
let side_dishes = html_side_dishes
.filter_map(|dish| Dish::from_element(dish, DishType::Side))
.collect::<Vec<_>>();
let html_desserts_selector = scraper::Selector::parse(
"table.table-dishes.soups > tbody > tr.odd > td.description > div.row",
)
.map_err(|_| CustomError::from("Failed to parse selector"))?;
let html_desserts = document.select(&html_desserts_selector);
let desserts = html_desserts
.filter_map(|dish| Dish::from_element(dish, DishType::Dessert))
.collect::<Vec<_>>();
let main_dishes = scrape_category(&document, &HTML_MAIN_DISHES_TBODY_SELECTOR, DishType::Main)?;
let side_dishes = scrape_category(&document, &HTML_SIDE_DISHES_TBODY_SELECTOR, DishType::Side)?;
let desserts = scrape_category(&document, &HTML_DESSERTS_TBODY_SELECTOR, DishType::Dessert)?;
let mut res = Vec::new();
res.extend(main_dishes);
res.extend(side_dishes);
res.extend(desserts);
dbg!(&res);
tracing::debug!("Finished scraping");
Ok(res)
}
static ITEM_SELECTOR: LazyLock<Selector> = LazyLock::new(|| {
Selector::parse("tr.odd > td.description > div.row").expect("Failed to parse selector")
});
static ITEM_DETAILS_SELECTOR: LazyLock<Selector> = LazyLock::new(|| {
Selector::parse("tr.even > td.more > div.ingredients-list").expect("Failed to parse selector")
});
fn scrape_category<'a>(
document: &'a Html,
tbody_selector: &Selector,
dish_type: DishType,
) -> Result<impl Iterator<Item = Dish> + 'a> {
let tbody = document.select(tbody_selector).next().ok_or_else(|| {
CustomError::from(format!("No tbody found for selector: {:?}", tbody_selector))
})?;
let dishes = tbody.select(&ITEM_SELECTOR);
let dish_details = tbody.select(&ITEM_DETAILS_SELECTOR);
Ok(dishes
.zip(dish_details)
.filter_map(move |(dish, details)| Dish::from_element(dish, details, dish_type)))
}

scraper/src/refresh.rs (new file)

@@ -0,0 +1,63 @@
use std::{collections::BTreeSet, str::FromStr};
use chrono::{NaiveDate, Utc};
use shared::Canteen;
use crate::util;
pub async fn check_refresh(db: &sqlx::PgPool, date: NaiveDate, canteens: &[Canteen]) -> bool {
let canteens_needing_refresh = match sqlx::query!(
r#"SELECT canteen, max(scraped_at) AS "scraped_at!" FROM canteens_scraped WHERE canteen = ANY($1) AND scraped_for = $2 GROUP BY canteen"#,
&canteens
.iter()
.map(|c| c.get_identifier().to_string())
.collect::<Vec<_>>(),
date
)
.fetch_all(db)
.await
{
Ok(v) => v.iter().filter_map(|r| if needs_refresh(r.scraped_at, date) { Some(Canteen::from_str(&r.canteen).expect("malformed db canteen entry")) } else { None }).collect::<BTreeSet<_>>(),
Err(err) => {
tracing::error!("Error checking for existing scrapes: {}", err);
return false;
}
};
if canteens_needing_refresh.is_empty() {
false
} else {
tracing::debug!(
"Refreshing menu for date {} for canteens: {:?}",
date,
canteens_needing_refresh
);
if let Err(err) = util::scrape_canteens_at_days(
db,
&canteens_needing_refresh
.iter()
.map(|c| (date, *c))
.collect::<Vec<_>>(),
)
.await
{
tracing::error!("Error during refresh scrape: {}", err);
return false;
}
true
}
}
fn needs_refresh(last_refreshed: chrono::DateTime<Utc>, date_entry: chrono::NaiveDate) -> bool {
let now = Utc::now();
if date_entry == now.naive_local().date() {
now.signed_duration_since(last_refreshed) >= chrono::Duration::hours(8)
} else if date_entry < now.naive_local().date() {
false
} else {
now.signed_duration_since(last_refreshed) >= chrono::Duration::days(2)
}
}
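
Note: needs_refresh encodes the staleness policy: today's menu is re-scraped once its last scrape is at least 8 hours old, past days are never re-scraped, and future days are re-scraped after 2 days. A sketch of a unit test pinning down the today branch (illustrative, not part of the diff):

    #[cfg(test)]
    mod tests {
        use super::*;

        #[test]
        fn today_refreshes_after_eight_hours() {
            let now = Utc::now();
            let today = now.naive_local().date();
            // 9 hours old -> stale, 1 hour old -> still fresh
            assert!(needs_refresh(now - chrono::Duration::hours(9), today));
            assert!(!needs_refresh(now - chrono::Duration::hours(1), today));
        }
    }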


@@ -1,64 +1,109 @@
use std::{env, future::Future};
use std::env;
use anyhow::Result;
use chrono::NaiveDate;
use futures::StreamExt as _;
use num_bigint::BigInt;
use sqlx::{postgres::PgPoolOptions, types::BigDecimal, PgPool};
use shared::{Canteen, DishType};
use sqlx::{postgres::PgPoolOptions, PgPool, PgTransaction};
use crate::{menu::scrape_menu, Canteen, Dish};
pub async fn async_for_each<F, Fut>(date_canteen_combinations: &[(NaiveDate, Canteen)], f: F)
where
F: FnMut((NaiveDate, Canteen, Vec<Dish>)) -> Fut,
Fut: Future<Output = ()>,
{
futures::stream::iter(date_canteen_combinations)
.then(|(date, canteen)| async move { (*date, *canteen, scrape_menu(date, *canteen).await) })
.filter_map(|(date, canteen, menu)| async move { menu.ok().map(|menu| (date, canteen, menu)) })
.for_each(f)
.await;
}
use crate::{scrape_menu, Dish};
pub fn get_db() -> Result<PgPool> {
Ok(PgPoolOptions::new()
.connect_lazy(&env::var("DATABASE_URL").expect("missing DATABASE_URL env variable"))?)
}
#[tracing::instrument(skip(db))]
pub async fn add_meal_to_db(db: &PgPool, date: &NaiveDate, canteen: Canteen, dish: &Dish) -> Result<()> {
let vegan = dish.is_vegan();
pub async fn scrape_canteens_at_days(
db: &PgPool,
date_canteen_combinations: &[(NaiveDate, Canteen)],
) -> Result<()> {
let (tx, mut rx) = tokio::sync::mpsc::channel::<(NaiveDate, Canteen, Vec<Dish>)>(128);
let mut transaction = db.begin().await?;
for (date, canteen) in date_canteen_combinations {
sqlx::query!(
"UPDATE meals SET is_latest = FALSE WHERE date = $1 AND canteen = $2 AND is_latest = TRUE",
date,
canteen.get_identifier()
)
.execute(&mut *transaction)
.await
.ok();
}
let insert_handle = tokio::spawn(async move {
while let Some((date, canteen, menu)) = rx.recv().await {
add_menu_to_db(&mut transaction, &date, canteen, menu).await?;
}
transaction.commit().await
});
futures::stream::iter(date_canteen_combinations)
.then(|(date, canteen)| async move { (*date, *canteen, scrape_menu(date, *canteen).await) })
.filter_map(
|(date, canteen, menu)| async move { menu.ok().map(|menu| (date, canteen, menu)) },
)
.for_each(|(date, canteen, menu)| {
let tx = tx.clone();
async move {
tx.send((date, canteen, menu)).await.ok();
}
})
.await;
drop(tx);
insert_handle.await??;
Ok(())
}
pub async fn add_menu_to_db(
db: &mut PgTransaction<'_>,
date: &NaiveDate,
canteen: Canteen,
menu: Vec<Dish>,
) -> Result<(), sqlx::Error> {
if menu.is_empty() {
return Ok(());
}
let mut query = sqlx::QueryBuilder::new("INSERT INTO meals (date,canteen,name,dish_type,image_src,price_students,price_employees,price_guests,vegan,vegetarian,kjoules,proteins,carbohydrates,fats) ");
query
.push_values(menu, |mut sep, item| {
let vegan = item.is_vegan();
sep.push_bind(date)
.push_bind(canteen.get_identifier())
.push_bind(item.get_name().to_string())
.push_bind(item.get_type())
.push_bind(item.get_image_src().map(str::to_string))
.push_bind(item.get_price_students().to_owned())
.push_bind(item.get_price_employees().to_owned())
.push_bind(item.get_price_guests().to_owned())
.push_bind(vegan)
.push_bind(vegan || item.is_vegetarian())
.push_bind(item.nutrition_values.kjoule)
.push_bind(item.nutrition_values.protein.to_owned())
.push_bind(item.nutrition_values.carbs.to_owned())
.push_bind(item.nutrition_values.fat.to_owned());
})
.build()
.execute(&mut **db)
.await?;
sqlx::query!(
"INSERT INTO meals (date,canteen,name,dish_type,image_src,price_students,price_employees,price_guests,vegan,vegetarian)
VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10)
ON CONFLICT (date,canteen,name) DO NOTHING",
date, canteen.get_identifier(), dish.get_name(),
dish.get_type().to_string(), dish.get_image_src(),
price_to_bigdecimal(dish.get_price_students()),
price_to_bigdecimal(dish.get_price_employees()),
price_to_bigdecimal(dish.get_price_guests()),
vegan, vegan || dish.is_vegetarian()
).execute(db).await.inspect_err(|e| {
tracing::error!("error during database insert: {}", e);
})?;
"INSERT INTO canteens_scraped (scraped_for, canteen) VALUES ($1, $2)",
date,
canteen.get_identifier()
)
.execute(&mut **db)
.await?;
tracing::trace!("Insert into DB successful");
Ok(())
}
pub async fn add_menu_to_db(db: &PgPool, date: &NaiveDate, canteen: Canteen, menu: Vec<Dish>) {
futures::stream::iter(menu)
.for_each(|dish| async move {
if !dish.get_name().is_empty() {
add_meal_to_db(db, date, canteen, &dish).await.ok();
}
})
.await;
}
pub fn price_to_bigdecimal(s: Option<&str>) -> BigDecimal {
s.and_then(|p| p.trim_end_matches(" €").replace(',', ".").parse().ok())
.unwrap_or_else(|| BigDecimal::new(BigInt::from(99999), 2))
}
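
Note: the rewritten pipeline separates network-bound scraping from the database writes. Scrape results stream through an mpsc channel into a single task that owns one transaction, so all UPDATE and INSERT statements of a batch commit together or roll back on the first error; dropping the sender closes the channel and lets the insert task commit. Driving it is a one-liner, mirroring main.rs above (combos is an illustrative binding):

    let db = util::get_db()?;
    let combos = vec![(chrono::Utc::now().date_naive(), Canteen::Forum)];
    util::scrape_canteens_at_days(&db, &combos).await?;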

shared/Cargo.toml (new file)

@@ -0,0 +1,13 @@
[package]
name = "shared"
version = "0.1.0"
edition = "2024"
license.workspace = true
authors.workspace = true
repository.workspace = true
readme.workspace = true
[dependencies]
serde = { workspace = true, features = ["derive"] }
strum = { workspace = true, features = ["derive"] }
sqlx = { workspace = true }

shared/src/lib.rs (new file)

@@ -0,0 +1,24 @@
use std::fmt::Display;
mod canteen;
pub use canteen::Canteen;
#[derive(Debug, Clone, Copy, PartialEq, Eq, sqlx::Type)]
#[sqlx(type_name = "dish_type_enum")]
#[sqlx(rename_all = "lowercase")]
pub enum DishType {
Main,
Side,
Dessert,
}
impl Display for DishType {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let s = match self {
Self::Main => "main",
Self::Side => "side",
Self::Dessert => "dessert",
};
f.write_str(s)
}
}
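
Note: the sqlx::Type derive with type_name = "dish_type_enum" and rename_all = "lowercase" binds this enum to the Postgres enum created in the migration, so 'main', 'side', and 'dessert' decode straight into the variants. Query sites opt in via sqlx's type-override alias, as the web-api diff below does; a minimal sketch (db is an illustrative pool):

    use shared::DishType;

    // The `AS "dish_type: DishType"` alias tells query! to decode the column
    // as the custom enum instead of TEXT (sketch only).
    let rows = sqlx::query!(
        r#"SELECT dish_type AS "dish_type: DishType" FROM meals WHERE is_latest = TRUE"#
    )
    .fetch_all(&db)
    .await?;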


@@ -10,18 +10,20 @@ edition = "2021"
publish = false
[dependencies]
actix-cors = "0.7.0"
actix-governor = { version = "0.8.0", features = ["log"] }
actix-web = "4.9.0"
actix-cors = "0.7.1"
actix-governor = { version = "0.10.0", features = ["log"] }
actix-web = "4.12.1"
anyhow = { workspace = true }
bigdecimal = { version = "0.4.6", features = ["serde"] }
bigdecimal = { version = "0.4.9", features = ["serde"] }
chrono = { workspace = true, features = ["serde"] }
dotenvy = { workspace = true }
itertools = { workspace = true }
serde = { version = "1.0.215", features = ["derive"] }
serde_json = "1.0.133"
mensa-upb-scraper = { path = "../scraper" }
serde = { workspace = true, features = ["derive"] }
serde_json = "1.0.145"
shared = { path = "../shared" }
sqlx = { workspace = true, features = ["runtime-tokio-rustls", "postgres", "migrate", "chrono", "uuid", "bigdecimal"] }
strum = { workspace = true, features = ["derive"] }
tokio = { workspace = true, features = ["macros", "rt-multi-thread"] }
tracing = "0.1.40"
tracing = "0.1.43"
tracing-subscriber = { workspace = true, features = ["env-filter"] }


@@ -1,7 +1,6 @@
use bigdecimal::BigDecimal;
use serde::{Deserialize, Serialize};
use crate::Canteen;
use shared::Canteen;
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Dish {


@@ -5,13 +5,16 @@ use chrono::NaiveDate;
use itertools::Itertools as _;
use serde::{Deserialize, Serialize};
use serde_json::json;
use shared::Canteen;
use sqlx::PgPool;
use crate::{Canteen, Menu};
use crate::Menu;
#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)]
#[serde(rename_all = "camelCase")]
struct MenuQuery {
date: Option<NaiveDate>,
no_update: Option<bool>,
}
#[get("/menu/{canteen}")]
async fn menu(
@@ -31,7 +34,7 @@ async fn menu(
.date
.unwrap_or_else(|| chrono::Local::now().date_naive());
let menu = Menu::query(&db, date, &canteens).await;
let menu = Menu::query(&db, date, &canteens, !query.no_update.unwrap_or_default()).await;
if let Ok(menu) = menu {
HttpResponse::Ok().json(menu)


@@ -1,10 +1,9 @@
use actix_web::{get, web::ServiceConfig, HttpResponse, Responder};
use itertools::Itertools as _;
use serde_json::json;
use shared::Canteen;
use strum::IntoEnumIterator as _;
use crate::Canteen;
mod menu;
pub fn configure(cfg: &mut ServiceConfig) {


@@ -1,12 +1,10 @@
mod canteen;
mod dish;
pub mod endpoints;
mod governor;
mod menu;
use std::{error::Error, fmt::Display, sync::LazyLock};
use std::sync::LazyLock;
pub use canteen::Canteen;
pub use dish::{Dish, DishPrices};
pub use governor::get_governor;
pub use menu::Menu;
@@ -16,26 +14,3 @@ pub(crate) static USE_X_FORWARDED_HOST: LazyLock<bool> = LazyLock::new(|| {
.map(|val| val == "true")
.unwrap_or(false)
});
#[derive(Debug, Clone)]
struct CustomError(String);
impl Display for CustomError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.0)
}
}
impl Error for CustomError {}
impl From<&str> for CustomError {
fn from(s: &str) -> Self {
CustomError(s.to_string())
}
}
impl From<String> for CustomError {
fn from(s: String) -> Self {
CustomError(s)
}
}


@@ -1,10 +1,11 @@
use chrono::NaiveDate;
use mensa_upb_scraper::check_refresh;
use serde::{Deserialize, Serialize};
use shared::{Canteen, DishType};
use sqlx::PgPool;
use std::str::FromStr as _;
use chrono::NaiveDate;
use serde::{Deserialize, Serialize};
use sqlx::PgPool;
use crate::{Canteen, Dish, DishPrices};
use crate::{Dish, DishPrices};
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct Menu {
@@ -15,18 +16,32 @@
}
impl Menu {
pub async fn query(db: &PgPool, date: NaiveDate, canteens: &[Canteen]) -> sqlx::Result<Self> {
let canteens = canteens
pub async fn query(
db: &PgPool,
date: NaiveDate,
canteens: &[Canteen],
allow_refresh: bool,
) -> sqlx::Result<Self> {
let canteens_str = canteens
.iter()
.map(|c| c.get_identifier().to_string())
.collect::<Vec<_>>();
let result = sqlx::query!("SELECT name, array_agg(DISTINCT canteen ORDER BY canteen) AS canteens, dish_type, image_src, price_students, price_employees, price_guests, vegan, vegetarian
FROM meals WHERE date = $1 AND canteen = ANY($2)
let query_db = async || {
sqlx::query!(r#"SELECT name, array_agg(DISTINCT canteen ORDER BY canteen) AS "canteens!", dish_type AS "dish_type: DishType", image_src, price_students, price_employees, price_guests, vegan, vegetarian
FROM meals WHERE date = $1 AND canteen = ANY($2) AND is_latest = TRUE
GROUP BY name, dish_type, image_src, price_students, price_employees, price_guests, vegan, vegetarian
ORDER BY name",
date, &canteens)
ORDER BY name"#,
date, &canteens_str)
.fetch_all(db)
.await?;
.await
};
let mut result = query_db().await?;
if allow_refresh && check_refresh(db, date, canteens).await {
result = query_db().await?;
}
let mut main_dishes = Vec::new();
let mut side_dishes = Vec::new();
@@ -36,12 +51,11 @@ impl Menu {
let dish = Dish {
name: row.name,
image_src: row.image_src,
canteens: row.canteens.map_or_else(Vec::new, |canteens| {
canteens
.iter()
.map(|canteen| Canteen::from_str(canteen).expect("Invalid database entry"))
.collect()
}),
canteens: row
.canteens
.iter()
.map(|canteen| Canteen::from_str(canteen).expect("Invalid database entry"))
.collect(),
vegan: row.vegan,
vegetarian: row.vegetarian,
price: DishPrices {
@@ -50,11 +64,11 @@
guests: row.price_guests.with_prec(5).with_scale(2),
},
};
if row.dish_type == "main" {
if row.dish_type == DishType::Main {
main_dishes.push(dish);
} else if row.dish_type == "side" {
} else if row.dish_type == DishType::Side {
side_dishes.push(dish);
} else if row.dish_type == "dessert" {
} else if row.dish_type == DishType::Dessert {
desserts.push(dish);
}
}