change to use postgres db instead of scraping

This commit is contained in:
Moritz Hölting 2024-11-20 20:11:00 +01:00
parent 12d3f58832
commit bc88064c82
14 changed files with 1357 additions and 1283 deletions

2121
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -6,24 +6,22 @@ authors = ["Moritz Hölting"]
repository = "https://github.com/moritz-hoelting/mensa-upb-api" repository = "https://github.com/moritz-hoelting/mensa-upb-api"
publish = false publish = false
readme = "README.md" readme = "README.md"
version = "0.1.1" version = "0.2.0"
edition = "2021" edition = "2021"
[dependencies] [dependencies]
actix-cors = "0.7.0" actix-cors = "0.7.0"
actix-governor = { version = "0.5.0", features = ["log"] } actix-governor = { version = "0.7.0", features = ["log"] }
actix-web = "4.8.0" actix-web = "4.9.0"
anyhow = "1.0.86" anyhow = "1.0.93"
chrono = "0.4.38" bigdecimal = { version = "0.4.6", features = ["serde"] }
const_format = "0.2.32" chrono = { version = "0.4.38", features = ["serde"] }
dotenvy = "0.15.7" dotenvy = "0.15.7"
futures = "0.3.30"
itertools = "0.13.0" itertools = "0.13.0"
reqwest = "0.12.5" serde = { version = "1.0.215", features = ["derive"] }
scraper = "0.19.0" serde_json = "1.0.133"
serde = { version = "1.0.203", features = ["derive"] }
serde_json = "1.0.120"
strum = { version = "0.26.3", features = ["derive"] } strum = { version = "0.26.3", features = ["derive"] }
tokio = { version = "1.38.0", features = ["full"] } sqlx = { version = "0.8.2", features = ["runtime-tokio-rustls", "postgres", "migrate", "chrono", "uuid", "bigdecimal"] }
tokio = { version = "1.41.1", features = ["macros", "rt-multi-thread"] }
tracing = "0.1.40" tracing = "0.1.40"
tracing-subscriber = { version = "0.3.18", features = ["env-filter"] } tracing-subscriber = { version = "0.3.18", features = ["env-filter"] }

View File

@ -0,0 +1,3 @@
-- Revert the initial migration by removing the meals table.
-- IF EXISTS keeps the revert from failing when the table is already gone,
-- matching the IF NOT EXISTS guard used when the table is created.
DROP TABLE IF EXISTS meals;

View File

@ -0,0 +1,15 @@
-- Add up migration script here
CREATE TABLE IF NOT EXISTS meals(
date DATE NOT NULL,
canteen TEXT NOT NULL,
name TEXT NOT NULL,
dish_type TEXT NOT NULL,
image_src TEXT,
price_students DECIMAL(5, 2) NOT NULL,
price_employees DECIMAL(5, 2) NOT NULL,
price_guests DECIMAL(5, 2) NOT NULL,
vegan BOOLEAN DEFAULT FALSE,
vegetarian BOOLEAN DEFAULT FALSE,
PRIMARY KEY (date, canteen, name)
);

View File

@ -0,0 +1,4 @@
-- Down migration: allow NULL again in the vegan and vegetarian columns.
-- Only the NOT NULL constraints are dropped; nothing else about the columns is changed here.
ALTER TABLE meals ALTER COLUMN vegan DROP NOT NULL;
ALTER TABLE meals ALTER COLUMN vegetarian DROP NOT NULL;

View File

@ -0,0 +1,11 @@
-- Add up migration script here
ALTER TABLE meals
ALTER COLUMN vegan TYPE BOOLEAN USING (COALESCE(vegan, FALSE)),
ALTER COLUMN vegan SET DEFAULT FALSE,
ALTER COLUMN vegan SET NOT NULL;
ALTER TABLE meals
ALTER COLUMN vegetarian TYPE BOOLEAN USING (COALESCE(vegetarian, FALSE)),
ALTER COLUMN vegetarian SET DEFAULT FALSE,
ALTER COLUMN vegetarian SET NOT NULL

View File

@ -1,64 +0,0 @@
use std::{collections::HashMap, sync::Arc};
use chrono::{NaiveDate, Utc};
use futures::StreamExt;
use itertools::Itertools;
use tokio::sync::RwLock;
use tracing::{debug, instrument};
use crate::{Canteen, Menu};
/// In-memory cache of menus, keyed by `(date, canteen)`.
///
/// The map lives behind an `Arc<RwLock<..>>`, so cloning a `MenuCache` yields
/// another handle to the same underlying map rather than a copy of its contents.
#[derive(Debug, Clone, Default)]
pub struct MenuCache {
// Shared map from (day, canteen) to the menu stored for that pair.
cache: Arc<RwLock<HashMap<(NaiveDate, Canteen), Menu>>>,
}
impl MenuCache {
    /// Returns the menus of all `canteens` for `date`, merged into one [`Menu`].
    ///
    /// Canteens are processed one after another; a canteen whose menu cannot be
    /// obtained (`get` returns `None`) is skipped. With no usable canteen the
    /// result is `Menu::default()`.
    pub async fn get_combined(&self, canteens: &[Canteen], date: NaiveDate) -> Menu {
        futures::stream::iter(canteens)
            .then(|canteen| async move { self.get(*canteen, date).await })
            .filter_map(|c| async { c })
            .fold(Menu::default(), |a, b| async move { a.merged(b) })
            .await
    }

    /// Returns the menu of `canteen` on `date`, fetching and caching it on a miss.
    ///
    /// Once the cache holds more than 100 entries, outdated entries are evicted
    /// first. Returns `None` only when the menu is not cached and fetching it
    /// via `Menu::new` fails.
    #[instrument(skip(self))]
    pub async fn get(&self, canteen: Canteen, date: NaiveDate) -> Option<Menu> {
        let query = (date, canteen);

        // The read guard is a temporary and is released at the end of this statement.
        let is_cache_too_large = self.cache.read().await.len() > 100;
        if is_cache_too_large {
            self.clean_outdated().await;
        }

        // Look the entry up only AFTER a possible cleanup: checking membership
        // before the cleanup would report a hit for an entry the cleanup has
        // just evicted (any date before today), and the request would then
        // yield `None` instead of falling back to a fetch.
        let cached = self.cache.read().await.get(&query).cloned();
        if let Some(menu) = cached {
            return Some(menu);
        }

        debug!("Not in cache, fetching from network");
        let menu = Menu::new(date, canteen).await.ok()?;
        self.cache.write().await.insert(query, menu.clone());
        Some(menu)
    }

    /// Removes every cached menu whose date lies before today (UTC).
    pub async fn clean_outdated(&self) {
        let today = Utc::now().date_naive();
        // A single write lock and `retain` avoid cloning the keys and the gap
        // between a separate read phase and write phase.
        self.cache
            .write()
            .await
            .retain(|(date, _), _| *date >= today);
    }
}

View File

@ -1,6 +1,5 @@
use std::str::FromStr; use std::str::FromStr;
use const_format::concatcp;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use strum::EnumIter; use strum::EnumIter;
@ -18,22 +17,7 @@ pub enum Canteen {
Atrium, Atrium,
} }
const POST_URL_BASE: &str = "https://www.studierendenwerk-pb.de/gastronomie/speiseplaene/";
impl Canteen { impl Canteen {
pub fn get_url(&self) -> &str {
match self {
Self::Forum => concatcp!(POST_URL_BASE, "forum/"),
Self::Academica => concatcp!(POST_URL_BASE, "mensa-academica/"),
Self::Picknick => concatcp!(POST_URL_BASE, "picknick/"),
Self::BonaVista => concatcp!(POST_URL_BASE, "bona-vista/"),
Self::GrillCafe => concatcp!(POST_URL_BASE, "grillcafe/"),
Self::ZM2 => concatcp!(POST_URL_BASE, "mensa-zm2/"),
Self::Basilica => concatcp!(POST_URL_BASE, "mensa-basilica-hamm/"),
Self::Atrium => concatcp!(POST_URL_BASE, "mensa-atrium-lippstadt/"),
}
}
pub fn get_identifier(&self) -> &str { pub fn get_identifier(&self) -> &str {
match self { match self {
Self::Forum => "forum", Self::Forum => "forum",

View File

@ -1,47 +1,31 @@
use itertools::Itertools; use bigdecimal::BigDecimal;
use scraper::ElementRef;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use crate::Canteen; use crate::Canteen;
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Dish { pub struct Dish {
name: String, pub name: String,
image_src: Option<String>, pub image_src: Option<String>,
price_students: Option<String>, pub price: DishPrices,
price_employees: Option<String>, pub vegetarian: bool,
price_guests: Option<String>, pub vegan: bool,
extras: Vec<String>, pub canteens: Vec<Canteen>,
canteens: Vec<Canteen>, }
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct DishPrices {
pub students: BigDecimal,
pub employees: BigDecimal,
pub guests: BigDecimal,
} }
impl Dish { impl Dish {
pub fn get_name(&self) -> &str {
&self.name
}
pub fn get_price_students(&self) -> Option<&str> {
self.price_students.as_deref()
}
pub fn get_price_employees(&self) -> Option<&str> {
self.price_employees.as_deref()
}
pub fn get_price_guests(&self) -> Option<&str> {
self.price_guests.as_deref()
}
pub fn get_extras(&self) -> &[String] {
&self.extras
}
pub fn get_canteens(&self) -> &[Canteen] {
&self.canteens
}
pub fn same_as(&self, other: &Self) -> bool { pub fn same_as(&self, other: &Self) -> bool {
self.name == other.name self.name == other.name
&& self.price_employees == other.price_employees && self.price == other.price
&& self.price_guests == other.price_guests && self.vegan == other.vegan
&& self.price_students == other.price_students && self.vegetarian == other.vegetarian
&& self.extras.iter().sorted().collect_vec()
== self.extras.iter().sorted().collect_vec()
} }
pub fn merge(&mut self, other: Self) { pub fn merge(&mut self, other: Self) {
@ -51,75 +35,6 @@ impl Dish {
} }
} }
impl Dish {
pub fn from_element(element: ElementRef, canteen: Canteen) -> Option<Self> {
let html_name_selector = scraper::Selector::parse(".desc h4").ok()?;
let name = element
.select(&html_name_selector)
.next()?
.text()
.collect::<Vec<_>>()
.join("")
.trim()
.to_string();
let img_selector = scraper::Selector::parse(".img img").ok()?;
let img_src = element.select(&img_selector).next().and_then(|el| {
el.value()
.attr("src")
.map(|img_src_path| format!("https://www.studierendenwerk-pb.de/{}", img_src_path))
});
let html_price_selector = scraper::Selector::parse(".desc .price").ok()?;
let mut prices = element
.select(&html_price_selector)
.filter_map(|price| {
let price_for = price.first_child().and_then(|strong| {
strong.first_child().and_then(|text_element| {
text_element
.value()
.as_text()
.map(|text| text.trim().trim_end_matches(':').to_string())
})
});
let price_value = price.last_child().and_then(|text_element| {
text_element
.value()
.as_text()
.map(|text| text.trim().to_string())
});
price_for
.and_then(|price_for| price_value.map(|price_value| (price_for, price_value)))
})
.collect::<Vec<_>>();
let html_extras_selector = scraper::Selector::parse(".desc .buttons > *").ok()?;
let extras = element
.select(&html_extras_selector)
.filter_map(|extra| extra.value().attr("title").map(|title| title.to_string()))
.collect::<Vec<_>>();
Some(Self {
name,
image_src: img_src,
price_students: prices
.iter_mut()
.find(|(price_for, _)| price_for == "Studierende")
.map(|(_, price)| std::mem::take(price)),
price_employees: prices
.iter_mut()
.find(|(price_for, _)| price_for == "Bedienstete")
.map(|(_, price)| std::mem::take(price)),
price_guests: prices
.iter_mut()
.find(|(price_for, _)| price_for == "Gäste")
.map(|(_, price)| std::mem::take(price)),
extras,
canteens: vec![canteen],
})
}
}
impl PartialOrd for Dish { impl PartialOrd for Dish {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> { fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
self.name.partial_cmp(&other.name) self.name.partial_cmp(&other.name)

49
src/endpoints/menu.rs Normal file
View File

@ -0,0 +1,49 @@
use std::str::FromStr as _;
use actix_web::{get, web, HttpResponse, Responder};
use chrono::NaiveDate;
use itertools::Itertools as _;
use serde::{Deserialize, Serialize};
use serde_json::json;
use sqlx::PgPool;
use crate::{Canteen, Menu};
/// Query-string parameters accepted by the `/menu/{canteen}` endpoint.
#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)]
struct MenuQuery {
/// Day to return the menu for. When omitted, the handler falls back to the
/// server's current local date.
date: Option<NaiveDate>,
}
/// Serves the combined menu of one or more canteens for a single day.
///
/// The `{canteen}` path segment is a comma-separated list of canteen
/// identifiers. The optional `date` query parameter selects the day and
/// defaults to the server's current local date.
///
/// Responses:
/// - `200 OK` with the menu as JSON,
/// - `400 Bad Request` listing the parse errors if any identifier is invalid,
/// - `500 Internal Server Error` if the database query fails.
#[get("/menu/{canteen}")]
async fn menu(
    path: web::Path<String>,
    query: web::Query<MenuQuery>,
    db: web::Data<PgPool>,
) -> impl Responder {
    let parsed = path
        .into_inner()
        .split(',')
        .map(Canteen::from_str)
        .collect_vec();

    // Reject the whole request as soon as a single identifier fails to parse.
    if parsed.iter().any(Result::is_err) {
        let invalid = parsed.into_iter().filter_map(Result::err).collect_vec();
        return HttpResponse::BadRequest().json(json!({
            "error": "Invalid canteen identifier",
            "invalid": invalid
        }));
    }

    let canteens = parsed.into_iter().filter_map(Result::ok).collect_vec();
    let date = query
        .date
        .unwrap_or_else(|| chrono::Local::now().date_naive());

    match Menu::query(&db, date, &canteens).await {
        Ok(menu) => HttpResponse::Ok().json(menu),
        Err(_) => HttpResponse::InternalServerError().json(json!({
            "error": "Failed to query database",
        })),
    }
}

22
src/endpoints/mod.rs Normal file
View File

@ -0,0 +1,22 @@
use actix_web::{get, web::ServiceConfig, HttpResponse, Responder};
use itertools::Itertools as _;
use serde_json::json;
use strum::IntoEnumIterator as _;
use crate::Canteen;
mod menu;
/// Registers the `index` and `menu` endpoints on the given service configuration.
pub fn configure(cfg: &mut ServiceConfig) {
    cfg.service(index).service(menu::menu);
}
#[get("/")]
async fn index() -> impl Responder {
HttpResponse::Ok().json(json!({
"version": env!("CARGO_PKG_VERSION"),
"description": env!("CARGO_PKG_DESCRIPTION"),
"supportedCanteens": Canteen::iter().map(|c| c.get_identifier().to_string()).collect_vec(),
}))
}

View File

@ -1,13 +1,12 @@
mod cache;
mod canteen; mod canteen;
mod dish; mod dish;
pub mod endpoints;
mod menu; mod menu;
use std::{error::Error, fmt::Display}; use std::{error::Error, fmt::Display};
pub use cache::MenuCache;
pub use canteen::Canteen; pub use canteen::Canteen;
pub use dish::Dish; pub use dish::{Dish, DishPrices};
pub use menu::Menu; pub use menu::Menu;
#[derive(Debug, Clone)] #[derive(Debug, Clone)]

View File

@ -1,19 +1,16 @@
use std::{env, io, str::FromStr}; use std::env;
use actix_cors::Cors; use actix_cors::Cors;
use actix_governor::{Governor, GovernorConfigBuilder}; use actix_governor::{Governor, GovernorConfigBuilder};
use actix_web::{get, web, App, HttpResponse, HttpServer, Responder}; use actix_web::{web, App, HttpServer};
use chrono::{Duration as CDuration, Utc}; use anyhow::Result;
use itertools::Itertools; use itertools::Itertools;
use mensa_upb_api::{Canteen, MenuCache}; use sqlx::postgres::PgPoolOptions;
use serde::{Deserialize, Serialize};
use serde_json::json;
use strum::IntoEnumIterator;
use tracing::{debug, error, info, level_filters::LevelFilter}; use tracing::{debug, error, info, level_filters::LevelFilter};
use tracing_subscriber::EnvFilter; use tracing_subscriber::EnvFilter;
#[tokio::main] #[tokio::main]
async fn main() -> io::Result<()> { async fn main() -> Result<()> {
let env_filter = EnvFilter::builder() let env_filter = EnvFilter::builder()
.with_default_directive(LevelFilter::WARN.into()) .with_default_directive(LevelFilter::WARN.into())
.from_env() .from_env()
@ -27,6 +24,9 @@ async fn main() -> io::Result<()> {
Err(_) => {} Err(_) => {}
} }
let db = PgPoolOptions::new()
.connect_lazy(&env::var("DATABASE_URL").expect("missing DATABASE_URL env variable"))?;
let interface = env::var("API_INTERFACE").unwrap_or("127.0.0.1".to_string()); let interface = env::var("API_INTERFACE").unwrap_or("127.0.0.1".to_string());
let port = env::var("API_PORT") let port = env::var("API_PORT")
.ok() .ok()
@ -51,13 +51,11 @@ async fn main() -> io::Result<()> {
.unwrap_or_default(); .unwrap_or_default();
let governor_conf = GovernorConfigBuilder::default() let governor_conf = GovernorConfigBuilder::default()
.per_second(seconds_replenish) .seconds_per_request(seconds_replenish)
.burst_size(burst_size) .burst_size(burst_size)
.finish() .finish()
.unwrap(); .unwrap();
let menu_cache = MenuCache::default();
info!("Starting server on {}:{}", interface, port); info!("Starting server on {}:{}", interface, port);
HttpServer::new(move || { HttpServer::new(move || {
@ -71,62 +69,12 @@ async fn main() -> io::Result<()> {
App::new() App::new()
.wrap(Governor::new(&governor_conf)) .wrap(Governor::new(&governor_conf))
.wrap(cors) .wrap(cors)
.app_data(web::Data::new(menu_cache.clone())) .app_data(web::Data::new(db.clone()))
.service(index) .configure(mensa_upb_api::endpoints::configure)
.service(menu_today)
}) })
.bind((interface.as_str(), port))? .bind((interface.as_str(), port))?
.run() .run()
.await .await?;
}
#[get("/")] Ok(())
async fn index() -> impl Responder {
HttpResponse::Ok().json(json!({
"version": env!("CARGO_PKG_VERSION"),
"description": env!("CARGO_PKG_DESCRIPTION"),
"supportedCanteens": Canteen::iter().map(|c| c.get_identifier().to_string()).collect_vec(),
}))
}
#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)]
struct MenuQuery {
#[serde(rename = "d")]
days_ahead: Option<String>,
}
#[get("/menu/{canteen}")]
async fn menu_today(
cache: web::Data<MenuCache>,
path: web::Path<String>,
query: web::Query<MenuQuery>,
) -> impl Responder {
let canteens = path
.into_inner()
.split(',')
.map(Canteen::from_str)
.collect_vec();
if canteens.iter().all(Result::is_ok) {
let canteens = canteens.into_iter().filter_map(Result::ok).collect_vec();
let days_ahead = query
.days_ahead
.as_ref()
.map_or(Ok(0), |d| d.parse::<i64>());
if let Ok(days_ahead) = days_ahead {
let date = (Utc::now() + CDuration::days(days_ahead)).date_naive();
let menu = cache.get_combined(&canteens, date).await;
HttpResponse::Ok().json(menu)
} else {
HttpResponse::BadRequest().json(json!({
"error": "Invalid days query"
}))
}
} else {
HttpResponse::BadRequest().json(json!({
"error": "Invalid canteen identifier",
"invalid": canteens.into_iter().filter_map(|c| c.err()).collect_vec()
}))
}
} }

View File

@ -1,19 +1,70 @@
use anyhow::Result; use std::str::FromStr as _;
use chrono::NaiveDate; use chrono::NaiveDate;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use sqlx::PgPool;
use crate::{Canteen, CustomError, Dish}; use crate::{Canteen, Dish, DishPrices};
#[derive(Debug, Clone, Serialize, Deserialize, Default)] #[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct Menu { pub struct Menu {
date: NaiveDate,
main_dishes: Vec<Dish>, main_dishes: Vec<Dish>,
side_dishes: Vec<Dish>, side_dishes: Vec<Dish>,
desserts: Vec<Dish>, desserts: Vec<Dish>,
} }
impl Menu { impl Menu {
pub async fn new(day: NaiveDate, canteen: Canteen) -> Result<Self> { pub async fn query(db: &PgPool, date: NaiveDate, canteens: &[Canteen]) -> sqlx::Result<Self> {
scrape_menu(canteen, day).await let canteens = canteens
.iter()
.map(|c| c.get_identifier().to_string())
.collect::<Vec<_>>();
let result = sqlx::query!("SELECT name, array_agg(DISTINCT canteen ORDER BY canteen) AS canteens, dish_type, image_src, price_students, price_employees, price_guests, vegan, vegetarian
FROM meals WHERE date = $1 AND canteen = ANY($2)
GROUP BY name, dish_type, image_src, price_students, price_employees, price_guests, vegan, vegetarian
ORDER BY name",
date, &canteens)
.fetch_all(db)
.await?;
let mut main_dishes = Vec::new();
let mut side_dishes = Vec::new();
let mut desserts = Vec::new();
for row in result {
let dish = Dish {
name: row.name,
image_src: row.image_src,
canteens: row.canteens.map_or_else(Vec::new, |canteens| {
canteens
.iter()
.map(|canteen| Canteen::from_str(canteen).expect("Invalid database entry"))
.collect()
}),
vegan: row.vegan,
vegetarian: row.vegetarian,
price: DishPrices {
students: row.price_students.with_prec(5).with_scale(2),
employees: row.price_employees.with_prec(5).with_scale(2),
guests: row.price_guests.with_prec(5).with_scale(2),
},
};
if row.dish_type == "main" {
main_dishes.push(dish);
} else if row.dish_type == "side" {
side_dishes.push(dish);
} else if row.dish_type == "dessert" {
desserts.push(dish);
}
}
Ok(Self {
date,
main_dishes,
side_dishes,
desserts,
})
} }
pub fn get_main_dishes(&self) -> &[Dish] { pub fn get_main_dishes(&self) -> &[Dish] {
@ -55,59 +106,11 @@ impl Menu {
} }
} }
main_dishes.sort_by(|a, b| a.get_name().cmp(b.get_name()));
side_dishes.sort_by(|a, b| a.get_name().cmp(b.get_name()));
desserts.sort_by(|a, b| a.get_name().cmp(b.get_name()));
Self { Self {
date: self.date,
main_dishes, main_dishes,
side_dishes, side_dishes,
desserts, desserts,
} }
} }
} }
async fn scrape_menu(canteen: Canteen, day: NaiveDate) -> Result<Menu> {
let url = canteen.get_url();
let client = reqwest::Client::new();
let request_builder = client
.post(url)
.query(&[("tx_pamensa_mensa[date]", day.format("%Y-%m-%d").to_string())]);
let response = request_builder.send().await?;
let html_content = response.text().await?;
let document = scraper::Html::parse_document(&html_content);
let html_main_dishes_selector = scraper::Selector::parse(
"table.table-dishes.main-dishes > tbody > tr.odd > td.description > div.row",
)
.map_err(|_| CustomError::from("Failed to parse selector"))?;
let html_main_dishes = document.select(&html_main_dishes_selector);
let main_dishes = html_main_dishes
.filter_map(|dish| Dish::from_element(dish, canteen))
.collect::<Vec<_>>();
let html_side_dishes_selector = scraper::Selector::parse(
"table.table-dishes.side-dishes > tbody > tr.odd > td.description > div.row",
)
.map_err(|_| CustomError::from("Failed to parse selector"))?;
let html_side_dishes = document.select(&html_side_dishes_selector);
let side_dishes = html_side_dishes
.filter_map(|dish| Dish::from_element(dish, canteen))
.collect::<Vec<_>>();
let html_desserts_selector = scraper::Selector::parse(
"table.table-dishes.soups > tbody > tr.odd > td.description > div.row",
)
.map_err(|_| CustomError::from("Failed to parse selector"))?;
let html_desserts = document.select(&html_desserts_selector);
let desserts = html_desserts
.filter_map(|dish| Dish::from_element(dish, canteen))
.collect::<Vec<_>>();
Ok(Menu {
main_dishes,
side_dishes,
desserts,
})
}