improve auto-nbsp detection algorithm

2025-10-21 22:17:48 +02:00
parent bdbda1189f
commit 378bd509c9
2 changed files with 33 additions and 22 deletions
--- a/template/lang.json
+++ b/template/lang.json
@@ -70,7 +70,19 @@
 		"next_page_attachment": "začíná na další straně",
 		"attached_bellow": "dále přiloženo",

-		"place_assignment": "Sem vložte zadání"
+		"place_assignment": "Sem vložte zadání",
+
+		"break_rules": {
+			"space_after": [
+				"((?i)[kosuvzai])",
+				"(tj|tzv|tzn)\\."
+			],
+			"nonbreaking_terms": [
+				"(s\\. r\\. o|a\\. s|v\\. o\\. s)\\.",
+				"č\\. ([pe]|ev)\\.",
+				"ev?\\. č\\."
+			]
+		}
 	},

 	"en": {
--- a/template/lang.typ
+++ b/template/lang.typ
@@ -46,30 +46,29 @@
 }

 #let set_czech_nonbreakable_terms(content) = {
-  let space_after = (
-    "[kosuvzai]",
-    "(tj|tzv|tzn)\.",
+  let rules = get_lang_item("cs", "break_rules"); // rule lists are looked up from the "cs" language data instead of being hard-coded here
+  let space_after = rules.at("space_after"); // regex fragments for tokens that are kept together with whatever follows them
+  let nonbreaking_terms = rules.at("nonbreaking_terms"); // regex fragments for multi-part abbreviations that are boxed as a whole
+
+  let terms = "\b(" + nonbreaking_terms.join("|") + ")"; // single alternation of all terms, anchored at a word boundary
+  let chain = (
+    "\b((" + space_after.join("|") + ") )+" + // one or more "space_after" tokens, each followed by exactly one space ...
+    "(" + terms + "|\w+\b)" // ... terminated by either a nonbreaking term or one plain word
   );
-  show regex("\b((?i)(" + space_after.join("|") + ") )+\w+\b"): match => {
-    box(match);
-  }
-
-  let nonbreaking_abbreviations = (
-    "a. s",
-    "s. r. o",
-    "v. o. s",
-    "k. s",
-    "n. p",
-    "p. o",
-    "č. ([pe]|ev)",
-    "ev?. č",
-  );
-  show regex(
-    "(?i)\b(" + nonbreaking_abbreviations.map((v) => { v.replace(".", "\\.") }).join("|") + ")\."
-  ): match => {
-    box(match);
+
+  let apply_rules(exprs: ("",), content) = { // wraps every match of each regex in `exprs` in a box, one nested show rule per regex
+    let res = content;
+    for expr in exprs {
+      res = {
+        show regex(expr): box; // box each match of this expression
+        res; // body is the result accumulated by the previous iterations
+      };
+    }
+    res
   }
 
+  show heading: apply_rules.with(exprs: (chain, terms)); // rules are applied to headings ...
+  show par: apply_rules.with(exprs: (chain, terms)); // ... and to paragraphs, chain pattern first, then standalone terms
   content
 }