change to use postgres db instead of scraping

This commit is contained in:
Moritz Hölting 2024-11-20 20:11:00 +01:00
parent 12d3f58832
commit bc88064c82
14 changed files with 1357 additions and 1283 deletions

2121
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -6,24 +6,22 @@ authors = ["Moritz Hölting"]
repository = "https://github.com/moritz-hoelting/mensa-upb-api"
publish = false
readme = "README.md"
version = "0.1.1"
version = "0.2.0"
edition = "2021"
[dependencies]
actix-cors = "0.7.0"
actix-governor = { version = "0.5.0", features = ["log"] }
actix-web = "4.8.0"
anyhow = "1.0.86"
chrono = "0.4.38"
const_format = "0.2.32"
actix-governor = { version = "0.7.0", features = ["log"] }
actix-web = "4.9.0"
anyhow = "1.0.93"
bigdecimal = { version = "0.4.6", features = ["serde"] }
chrono = { version = "0.4.38", features = ["serde"] }
dotenvy = "0.15.7"
futures = "0.3.30"
itertools = "0.13.0"
reqwest = "0.12.5"
scraper = "0.19.0"
serde = { version = "1.0.203", features = ["derive"] }
serde_json = "1.0.120"
serde = { version = "1.0.215", features = ["derive"] }
serde_json = "1.0.133"
strum = { version = "0.26.3", features = ["derive"] }
tokio = { version = "1.38.0", features = ["full"] }
sqlx = { version = "0.8.2", features = ["runtime-tokio-rustls", "postgres", "migrate", "chrono", "uuid", "bigdecimal"] }
tokio = { version = "1.41.1", features = ["macros", "rt-multi-thread"] }
tracing = "0.1.40"
tracing-subscriber = { version = "0.3.18", features = ["env-filter"] }

View File

@ -0,0 +1,3 @@
-- Add down migration script here
DROP TABLE meals;

View File

@ -0,0 +1,15 @@
-- Add up migration script here
CREATE TABLE IF NOT EXISTS meals(
date DATE NOT NULL,
canteen TEXT NOT NULL,
name TEXT NOT NULL,
dish_type TEXT NOT NULL,
image_src TEXT,
price_students DECIMAL(5, 2) NOT NULL,
price_employees DECIMAL(5, 2) NOT NULL,
price_guests DECIMAL(5, 2) NOT NULL,
vegan BOOLEAN DEFAULT FALSE,
vegetarian BOOLEAN DEFAULT FALSE,
PRIMARY KEY (date, canteen, name)
);

View File

@ -0,0 +1,4 @@
-- Add down migration script here
ALTER TABLE meals ALTER COLUMN vegan DROP NOT NULL;
ALTER TABLE meals ALTER COLUMN vegetarian DROP NOT NULL;

View File

@ -0,0 +1,11 @@
-- Add up migration script here
ALTER TABLE meals
ALTER COLUMN vegan TYPE BOOLEAN USING (COALESCE(vegan, FALSE)),
ALTER COLUMN vegan SET DEFAULT FALSE,
ALTER COLUMN vegan SET NOT NULL;
ALTER TABLE meals
ALTER COLUMN vegetarian TYPE BOOLEAN USING (COALESCE(vegetarian, FALSE)),
ALTER COLUMN vegetarian SET DEFAULT FALSE,
ALTER COLUMN vegetarian SET NOT NULL

View File

@ -1,64 +0,0 @@
use std::{collections::HashMap, sync::Arc};
use chrono::{NaiveDate, Utc};
use futures::StreamExt;
use itertools::Itertools;
use tokio::sync::RwLock;
use tracing::{debug, instrument};
use crate::{Canteen, Menu};
/// In-memory store of menus, keyed by date and canteen.
///
/// The map is held behind an `Arc<RwLock<..>>`, so clones of a `MenuCache`
/// refer to the same underlying map.
#[derive(Debug, Clone, Default)]
pub struct MenuCache {
/// Menus stored so far, keyed by `(date, canteen)`.
cache: Arc<RwLock<HashMap<(NaiveDate, Canteen), Menu>>>,
}
impl MenuCache {
    /// Builds a single menu for `date` by merging the menus of all `canteens`.
    ///
    /// Canteens are processed one after another in the given order. A canteen
    /// for which [`MenuCache::get`] yields `None` is skipped. With no usable
    /// canteen the result is `Menu::default()`.
    pub async fn get_combined(&self, canteens: &[Canteen], date: NaiveDate) -> Menu {
        futures::stream::iter(canteens)
            .then(|canteen| async move { self.get(*canteen, date).await })
            .filter_map(|c| async { c })
            .fold(Menu::default(), |a, b| async move { a.merged(b) })
            .await
    }

    /// Returns the menu of `canteen` on `date`, fetching and storing it on a miss.
    ///
    /// When the cache holds more than 100 entries, outdated entries are purged
    /// before the lookup. Returns `None` only if the menu is not cached and
    /// `Menu::new` fails.
    #[instrument(skip(self))]
    pub async fn get(&self, canteen: Canteen, date: NaiveDate) -> Option<Menu> {
        /// Entry count above which outdated entries are purged.
        const MAX_ENTRIES: usize = 100;

        let query = (date, canteen);

        // The read guard is a temporary of this statement and is released
        // before `clean_outdated` takes the write lock.
        let is_cache_too_large = self.cache.read().await.len() > MAX_ENTRIES;
        if is_cache_too_large {
            self.clean_outdated().await;
        }

        // Look the entry up only AFTER the purge. Checking membership first and
        // reading later would turn "entry was just purged" into a `None` result
        // instead of a fresh fetch.
        let cached = self.cache.read().await.get(&query).cloned();
        if let Some(menu) = cached {
            return Some(menu);
        }

        debug!("Not in cache, fetching from network");
        let menu = Menu::new(date, canteen).await.ok()?;
        self.cache.write().await.insert(query, menu.clone());
        Some(menu)
    }

    /// Removes every entry whose date lies before today's UTC date.
    pub async fn clean_outdated(&self) {
        let today = Utc::now().date_naive();
        // One write lock and one pass; nothing can change between deciding
        // which keys are outdated and removing them.
        self.cache
            .write()
            .await
            .retain(|(date, _), _| *date >= today);
    }
}

View File

@ -1,6 +1,5 @@
use std::str::FromStr;
use const_format::concatcp;
use serde::{Deserialize, Serialize};
use strum::EnumIter;
@ -18,22 +17,7 @@ pub enum Canteen {
Atrium,
}
const POST_URL_BASE: &str = "https://www.studierendenwerk-pb.de/gastronomie/speiseplaene/";
impl Canteen {
/// Returns the menu page URL for this canteen.
///
/// Each URL is `POST_URL_BASE` followed by a canteen-specific path segment,
/// joined at compile time with `concatcp!`.
pub fn get_url(&self) -> &str {
match self {
Self::Forum => concatcp!(POST_URL_BASE, "forum/"),
Self::Academica => concatcp!(POST_URL_BASE, "mensa-academica/"),
Self::Picknick => concatcp!(POST_URL_BASE, "picknick/"),
Self::BonaVista => concatcp!(POST_URL_BASE, "bona-vista/"),
Self::GrillCafe => concatcp!(POST_URL_BASE, "grillcafe/"),
Self::ZM2 => concatcp!(POST_URL_BASE, "mensa-zm2/"),
Self::Basilica => concatcp!(POST_URL_BASE, "mensa-basilica-hamm/"),
Self::Atrium => concatcp!(POST_URL_BASE, "mensa-atrium-lippstadt/"),
}
}
pub fn get_identifier(&self) -> &str {
match self {
Self::Forum => "forum",

View File

@ -1,47 +1,31 @@
use itertools::Itertools;
use scraper::ElementRef;
use bigdecimal::BigDecimal;
use serde::{Deserialize, Serialize};
use crate::Canteen;
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Dish {
name: String,
image_src: Option<String>,
price_students: Option<String>,
price_employees: Option<String>,
price_guests: Option<String>,
extras: Vec<String>,
canteens: Vec<Canteen>,
pub name: String,
pub image_src: Option<String>,
pub price: DishPrices,
pub vegetarian: bool,
pub vegan: bool,
pub canteens: Vec<Canteen>,
}
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct DishPrices {
pub students: BigDecimal,
pub employees: BigDecimal,
pub guests: BigDecimal,
}
impl Dish {
pub fn get_name(&self) -> &str {
&self.name
}
pub fn get_price_students(&self) -> Option<&str> {
self.price_students.as_deref()
}
pub fn get_price_employees(&self) -> Option<&str> {
self.price_employees.as_deref()
}
pub fn get_price_guests(&self) -> Option<&str> {
self.price_guests.as_deref()
}
pub fn get_extras(&self) -> &[String] {
&self.extras
}
pub fn get_canteens(&self) -> &[Canteen] {
&self.canteens
}
pub fn same_as(&self, other: &Self) -> bool {
self.name == other.name
&& self.price_employees == other.price_employees
&& self.price_guests == other.price_guests
&& self.price_students == other.price_students
&& self.extras.iter().sorted().collect_vec()
== self.extras.iter().sorted().collect_vec()
&& self.price == other.price
&& self.vegan == other.vegan
&& self.vegetarian == other.vegetarian
}
pub fn merge(&mut self, other: Self) {
@ -51,75 +35,6 @@ impl Dish {
}
}
impl Dish {
    /// Parses a single dish from its HTML `element` on a canteen's menu page.
    ///
    /// * name: text content of `.desc h4`, trimmed
    /// * image: `src` attribute of `.img img`, prefixed with the site origin
    /// * prices: `.desc .price` entries, looked up by the labels
    ///   `Studierende`, `Bedienstete` and `Gäste`
    /// * extras: `title` attributes of the children of `.desc .buttons`
    ///
    /// Returns `None` if a selector cannot be parsed or the element has no
    /// name heading. Missing image or prices simply become `None` fields.
    pub fn from_element(element: ElementRef, canteen: Canteen) -> Option<Self> {
        let name_selector = scraper::Selector::parse(".desc h4").ok()?;
        let raw_name: String = element.select(&name_selector).next()?.text().collect();
        let name = raw_name.trim().to_string();

        let image_selector = scraper::Selector::parse(".img img").ok()?;
        let image_src = element
            .select(&image_selector)
            .next()
            .and_then(|img| img.value().attr("src"))
            .map(|img_src_path| format!("https://www.studierendenwerk-pb.de/{}", img_src_path));

        // Each price node has a label in its first child's first text node
        // and the amount in its last text node.
        let price_selector = scraper::Selector::parse(".desc .price").ok()?;
        let mut labelled_prices: Vec<(String, String)> = element
            .select(&price_selector)
            .filter_map(|price| {
                let label = price
                    .first_child()?
                    .first_child()?
                    .value()
                    .as_text()?
                    .trim()
                    .trim_end_matches(':')
                    .to_string();
                let amount = price.last_child()?.value().as_text()?.trim().to_string();
                Some((label, amount))
            })
            .collect();

        let extras_selector = scraper::Selector::parse(".desc .buttons > *").ok()?;
        let extras: Vec<String> = element
            .select(&extras_selector)
            .filter_map(|extra| extra.value().attr("title"))
            .map(str::to_string)
            .collect();

        // Moves the amount stored under `label` out of the list, if present.
        let mut take_price = |label: &str| {
            labelled_prices
                .iter_mut()
                .find(|(price_for, _)| price_for == label)
                .map(|(_, amount)| std::mem::take(amount))
        };

        Some(Self {
            name,
            image_src,
            price_students: take_price("Studierende"),
            price_employees: take_price("Bedienstete"),
            price_guests: take_price("Gäste"),
            extras,
            canteens: vec![canteen],
        })
    }
}
impl PartialOrd for Dish {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
self.name.partial_cmp(&other.name)

49
src/endpoints/menu.rs Normal file
View File

@ -0,0 +1,49 @@
use std::str::FromStr as _;
use actix_web::{get, web, HttpResponse, Responder};
use chrono::NaiveDate;
use itertools::Itertools as _;
use serde::{Deserialize, Serialize};
use serde_json::json;
use sqlx::PgPool;
use crate::{Canteen, Menu};
/// Query-string parameters accepted by the `/menu/{canteen}` endpoint.
#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)]
struct MenuQuery {
/// Day to look up; the handler falls back to the current local date when absent.
date: Option<NaiveDate>,
}
#[get("/menu/{canteen}")]
async fn menu(
path: web::Path<String>,
query: web::Query<MenuQuery>,
db: web::Data<PgPool>,
) -> impl Responder {
let canteens = path
.into_inner()
.split(',')
.map(Canteen::from_str)
.collect_vec();
if canteens.iter().all(Result::is_ok) {
let canteens = canteens.into_iter().filter_map(Result::ok).collect_vec();
let date = query
.date
.unwrap_or_else(|| chrono::Local::now().date_naive());
let menu = Menu::query(&db, date, &canteens).await;
if let Ok(menu) = menu {
HttpResponse::Ok().json(menu)
} else {
HttpResponse::InternalServerError().json(json!({
"error": "Failed to query database",
}))
}
} else {
HttpResponse::BadRequest().json(json!({
"error": "Invalid canteen identifier",
"invalid": canteens.into_iter().filter_map(|c| c.err()).collect_vec()
}))
}
}

22
src/endpoints/mod.rs Normal file
View File

@ -0,0 +1,22 @@
use actix_web::{get, web::ServiceConfig, HttpResponse, Responder};
use itertools::Itertools as _;
use serde_json::json;
use strum::IntoEnumIterator as _;
use crate::Canteen;
mod menu;
/// Registers every HTTP endpoint of this module on `cfg`: the index route
/// and the menu route.
pub fn configure(cfg: &mut ServiceConfig) {
    cfg.service(index).service(menu::menu);
}
#[get("/")]
async fn index() -> impl Responder {
HttpResponse::Ok().json(json!({
"version": env!("CARGO_PKG_VERSION"),
"description": env!("CARGO_PKG_DESCRIPTION"),
"supportedCanteens": Canteen::iter().map(|c| c.get_identifier().to_string()).collect_vec(),
}))
}

View File

@ -1,13 +1,12 @@
mod cache;
mod canteen;
mod dish;
pub mod endpoints;
mod menu;
use std::{error::Error, fmt::Display};
pub use cache::MenuCache;
pub use canteen::Canteen;
pub use dish::Dish;
pub use dish::{Dish, DishPrices};
pub use menu::Menu;
#[derive(Debug, Clone)]

View File

@ -1,19 +1,16 @@
use std::{env, io, str::FromStr};
use std::env;
use actix_cors::Cors;
use actix_governor::{Governor, GovernorConfigBuilder};
use actix_web::{get, web, App, HttpResponse, HttpServer, Responder};
use chrono::{Duration as CDuration, Utc};
use actix_web::{web, App, HttpServer};
use anyhow::Result;
use itertools::Itertools;
use mensa_upb_api::{Canteen, MenuCache};
use serde::{Deserialize, Serialize};
use serde_json::json;
use strum::IntoEnumIterator;
use sqlx::postgres::PgPoolOptions;
use tracing::{debug, error, info, level_filters::LevelFilter};
use tracing_subscriber::EnvFilter;
#[tokio::main]
async fn main() -> io::Result<()> {
async fn main() -> Result<()> {
let env_filter = EnvFilter::builder()
.with_default_directive(LevelFilter::WARN.into())
.from_env()
@ -27,6 +24,9 @@ async fn main() -> io::Result<()> {
Err(_) => {}
}
let db = PgPoolOptions::new()
.connect_lazy(&env::var("DATABASE_URL").expect("missing DATABASE_URL env variable"))?;
let interface = env::var("API_INTERFACE").unwrap_or("127.0.0.1".to_string());
let port = env::var("API_PORT")
.ok()
@ -51,13 +51,11 @@ async fn main() -> io::Result<()> {
.unwrap_or_default();
let governor_conf = GovernorConfigBuilder::default()
.per_second(seconds_replenish)
.seconds_per_request(seconds_replenish)
.burst_size(burst_size)
.finish()
.unwrap();
let menu_cache = MenuCache::default();
info!("Starting server on {}:{}", interface, port);
HttpServer::new(move || {
@ -71,62 +69,12 @@ async fn main() -> io::Result<()> {
App::new()
.wrap(Governor::new(&governor_conf))
.wrap(cors)
.app_data(web::Data::new(menu_cache.clone()))
.service(index)
.service(menu_today)
.app_data(web::Data::new(db.clone()))
.configure(mensa_upb_api::endpoints::configure)
})
.bind((interface.as_str(), port))?
.run()
.await
}
#[get("/")]
async fn index() -> impl Responder {
HttpResponse::Ok().json(json!({
"version": env!("CARGO_PKG_VERSION"),
"description": env!("CARGO_PKG_DESCRIPTION"),
"supportedCanteens": Canteen::iter().map(|c| c.get_identifier().to_string()).collect_vec(),
}))
}
#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)]
struct MenuQuery {
#[serde(rename = "d")]
days_ahead: Option<String>,
}
#[get("/menu/{canteen}")]
async fn menu_today(
cache: web::Data<MenuCache>,
path: web::Path<String>,
query: web::Query<MenuQuery>,
) -> impl Responder {
let canteens = path
.into_inner()
.split(',')
.map(Canteen::from_str)
.collect_vec();
if canteens.iter().all(Result::is_ok) {
let canteens = canteens.into_iter().filter_map(Result::ok).collect_vec();
let days_ahead = query
.days_ahead
.as_ref()
.map_or(Ok(0), |d| d.parse::<i64>());
if let Ok(days_ahead) = days_ahead {
let date = (Utc::now() + CDuration::days(days_ahead)).date_naive();
let menu = cache.get_combined(&canteens, date).await;
HttpResponse::Ok().json(menu)
} else {
HttpResponse::BadRequest().json(json!({
"error": "Invalid days query"
}))
}
} else {
HttpResponse::BadRequest().json(json!({
"error": "Invalid canteen identifier",
"invalid": canteens.into_iter().filter_map(|c| c.err()).collect_vec()
}))
}
.await?;
Ok(())
}

View File

@ -1,19 +1,70 @@
use anyhow::Result;
use std::str::FromStr as _;
use chrono::NaiveDate;
use serde::{Deserialize, Serialize};
use sqlx::PgPool;
use crate::{Canteen, CustomError, Dish};
use crate::{Canteen, Dish, DishPrices};
/// The dishes offered on one day, grouped into main dishes, side dishes and desserts.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct Menu {
/// Day this menu applies to.
date: NaiveDate,
/// Dishes of type "main".
main_dishes: Vec<Dish>,
/// Dishes of type "side".
side_dishes: Vec<Dish>,
/// Dishes of type "dessert".
desserts: Vec<Dish>,
}
impl Menu {
pub async fn new(day: NaiveDate, canteen: Canteen) -> Result<Self> {
scrape_menu(canteen, day).await
pub async fn query(db: &PgPool, date: NaiveDate, canteens: &[Canteen]) -> sqlx::Result<Self> {
let canteens = canteens
.iter()
.map(|c| c.get_identifier().to_string())
.collect::<Vec<_>>();
let result = sqlx::query!("SELECT name, array_agg(DISTINCT canteen ORDER BY canteen) AS canteens, dish_type, image_src, price_students, price_employees, price_guests, vegan, vegetarian
FROM meals WHERE date = $1 AND canteen = ANY($2)
GROUP BY name, dish_type, image_src, price_students, price_employees, price_guests, vegan, vegetarian
ORDER BY name",
date, &canteens)
.fetch_all(db)
.await?;
let mut main_dishes = Vec::new();
let mut side_dishes = Vec::new();
let mut desserts = Vec::new();
for row in result {
let dish = Dish {
name: row.name,
image_src: row.image_src,
canteens: row.canteens.map_or_else(Vec::new, |canteens| {
canteens
.iter()
.map(|canteen| Canteen::from_str(canteen).expect("Invalid database entry"))
.collect()
}),
vegan: row.vegan,
vegetarian: row.vegetarian,
price: DishPrices {
students: row.price_students.with_prec(5).with_scale(2),
employees: row.price_employees.with_prec(5).with_scale(2),
guests: row.price_guests.with_prec(5).with_scale(2),
},
};
if row.dish_type == "main" {
main_dishes.push(dish);
} else if row.dish_type == "side" {
side_dishes.push(dish);
} else if row.dish_type == "dessert" {
desserts.push(dish);
}
}
Ok(Self {
date,
main_dishes,
side_dishes,
desserts,
})
}
pub fn get_main_dishes(&self) -> &[Dish] {
@ -55,59 +106,11 @@ impl Menu {
}
}
main_dishes.sort_by(|a, b| a.get_name().cmp(b.get_name()));
side_dishes.sort_by(|a, b| a.get_name().cmp(b.get_name()));
desserts.sort_by(|a, b| a.get_name().cmp(b.get_name()));
Self {
date: self.date,
main_dishes,
side_dishes,
desserts,
}
}
}
async fn scrape_menu(canteen: Canteen, day: NaiveDate) -> Result<Menu> {
let url = canteen.get_url();
let client = reqwest::Client::new();
let request_builder = client
.post(url)
.query(&[("tx_pamensa_mensa[date]", day.format("%Y-%m-%d").to_string())]);
let response = request_builder.send().await?;
let html_content = response.text().await?;
let document = scraper::Html::parse_document(&html_content);
let html_main_dishes_selector = scraper::Selector::parse(
"table.table-dishes.main-dishes > tbody > tr.odd > td.description > div.row",
)
.map_err(|_| CustomError::from("Failed to parse selector"))?;
let html_main_dishes = document.select(&html_main_dishes_selector);
let main_dishes = html_main_dishes
.filter_map(|dish| Dish::from_element(dish, canteen))
.collect::<Vec<_>>();
let html_side_dishes_selector = scraper::Selector::parse(
"table.table-dishes.side-dishes > tbody > tr.odd > td.description > div.row",
)
.map_err(|_| CustomError::from("Failed to parse selector"))?;
let html_side_dishes = document.select(&html_side_dishes_selector);
let side_dishes = html_side_dishes
.filter_map(|dish| Dish::from_element(dish, canteen))
.collect::<Vec<_>>();
let html_desserts_selector = scraper::Selector::parse(
"table.table-dishes.soups > tbody > tr.odd > td.description > div.row",
)
.map_err(|_| CustomError::from("Failed to parse selector"))?;
let html_desserts = document.select(&html_desserts_selector);
let desserts = html_desserts
.filter_map(|dish| Dish::from_element(dish, canteen))
.collect::<Vec<_>>();
Ok(Menu {
main_dishes,
side_dishes,
desserts,
})
}