#!/usr/bin/env python3
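"""Regenerate grammar.md from the ```ebnf blocks embedded in ///-doc comments of the Rust sources."""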
import re
import os
from pathlib import Path
from collections import defaultdict, deque
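
# Collected EBNF block texts, rule name -> definition text, and rule name -> referenced rule names.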
ebnf_blocks = []
rule_defs = {}
rule_deps = defaultdict(set)
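
# Patterns for finding ```ebnf fences inside /// doc comments, stripping the /// prefix,
# spotting the start of a rule definition ("Name:"), and treating word-like tokens as rule references.
# Illustrative shape of such a doc comment (the rule body is only an example, not taken from the sources):
#     /// ```ebnf
#     /// Program: Declaration* ;
#     /// ```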
ebnf_fence_start = re.compile(r"^\s*///\s*```\s*ebnf\s*$")
ebnf_fence_end = re.compile(r"^\s*///\s*```\s*$")
doc_comment_prefix = re.compile(r"^\s*///\s?(.*)$")
rule_start_pattern = re.compile(r"^\s*([A-Za-z_]\w*)\s*:")
rule_ref_pattern = re.compile(r"\b([A-Za-z_]\w*)\b")
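

# Walk upward from the current working directory to the directory whose Cargo.toml declares the "shulkerscript" package.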
def find_project_root() -> Path | None:
    current = Path.cwd()
    while current != current.parent:
        cargo_toml = current / "Cargo.toml"
        if cargo_toml.exists():
            text = cargo_toml.read_text(encoding="utf-8")
            if re.search(r'(?m)^\s*name\s*=\s*"shulkerscript"\s*$', text):
                return current
        current = current.parent
    return None


root_dir = find_project_root()
if not root_dir:
    raise SystemExit(
        "Could not find Cargo.toml of package 'shulkerscript' in this or any parent directory."
    )
if Path.cwd() != root_dir:
    os.chdir(root_dir)
    print(f"Changed working directory to {root_dir}")
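
# Remember which rules the existing grammar.md already documents so additions and removals can be reported later.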
previous_rules = set()
with open("grammar.md", "r", encoding="utf-8") as f:
    rule_header_pattern = re.compile(r"## (\w+)")
    for line in f:
        m = rule_header_pattern.match(line)
        if m:
            previous_rules.add(m.group(1))
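
# Scan every Rust source file for ```ebnf blocks in /// doc comments, split each block into individual
# rule definitions, and record which other rules every definition references.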
for path in Path(".").rglob("*.rs"):
    with path.open(encoding="utf-8") as f:
        in_block = False
        current_block_lines = []
        for line in f:
            if not in_block and ebnf_fence_start.match(line):
                in_block = True
                current_block_lines = []
                continue
            if in_block:
                if ebnf_fence_end.match(line):
                    block_text = "\n".join(current_block_lines)
                    ebnf_blocks.append(block_text)
                    current_rule_name = None
                    current_rule_lines = []
                    for ln in current_block_lines:
                        m = rule_start_pattern.match(ln)
                        if m:
                            if current_rule_name:
                                full_def = "\n".join(current_rule_lines)
                                rule_defs[current_rule_name] = full_def
                                refs = set(rule_ref_pattern.findall(full_def))
                                refs.discard(current_rule_name)
                                rule_deps[current_rule_name].update(refs)
                            current_rule_name = m.group(1)
                            current_rule_lines = [ln]
                        else:
                            if current_rule_name:
                                current_rule_lines.append(ln)
                    if current_rule_name:
                        full_def = "\n".join(current_rule_lines)
                        rule_defs[current_rule_name] = full_def
                        refs = set(rule_ref_pattern.findall(full_def))
                        refs.discard(current_rule_name)
                        rule_deps[current_rule_name].update(refs)
                    in_block = False
                    continue
                m = doc_comment_prefix.match(line)
                if m:
                    current_block_lines.append(m.group(1))

if "Program" not in rule_defs:
    raise SystemExit("Root rule 'Program' not found in EBNF definitions")
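
# Determine the output order with a breadth-first traversal starting at the root rule, so grammar.md reads top-down.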
visited = set()
order = []
queue = deque(["Program"])
while queue:
    rule = queue.popleft()
    if rule not in visited and rule in rule_defs:
        visited.add(rule)
        order.append(rule)
        for dep in sorted(rule_deps[rule]):
            if dep not in visited:
                queue.append(dep)
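
# Rules never reached from Program are still documented, appended after the reachable ones.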
unused_rules = sorted(set(rule_defs.keys()) - visited)
if len(unused_rules) > 0:
    print(
        f"Appending {len(unused_rules)} unused rules to the end: {', '.join(unused_rules)}"
    )
    order.extend(unused_rules)
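
# Rewrite grammar.md from scratch: a top-level heading plus one "## Rule" section with its EBNF per rule.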
with open("grammar.md", "w", encoding="utf-8") as out:
    out.write("# Grammar of the Shulkerscript language\n\n")
    for rule in order:
        out.write(f"## {rule}\n\n```ebnf\n{rule_defs[rule]}\n```\n\n")

print(f"Wrote grammar.md with {len(order)} rules.")
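
# Report how the documented rule set changed compared to the previous grammar.md.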
added_rules = set(rule_defs.keys()) - previous_rules
if len(added_rules) > 0:
    print(f"Added rules for: {', '.join(added_rules)}")
removed_rules = previous_rules - set(rule_defs.keys())
if len(removed_rules) > 0:
    print(f"Removed rules for: {', '.join(removed_rules)}")