add-text-preprocessing

#2
Files changed (2) hide show
  1. src/App.tsx +3 -1
  2. src/text-preprocessor.ts +85 -0
src/App.tsx CHANGED
@@ -3,6 +3,7 @@ import { Zap, AlignLeft, Quote, Type, FileText, Check, X, Dices } from "lucide-r
3
  import { useTTS } from "./components/TTSContext";
4
  import { TTSProvider } from "./components/TTSProvider";
5
  import { streamTTS, createAudioBlob } from "./tts";
 
6
  import { SAMPLE_RATE, EXAMPLE_SENTENCES } from "./constants";
7
  import { AudioResult } from "./components/AudioResult";
8
  import { Controls } from "./components/Controls";
@@ -155,7 +156,8 @@ const AppContent = () => {
155
  if (!tts.current || !speakerEmbeddings.current) throw new Error("TTS pipeline not ready");
156
  const selectedEmbedding = speakerEmbeddings.current[voice];
157
 
158
- for await (const result of streamTTS(text, tts.current, selectedEmbedding, quality, speed)) {
 
159
  if (stopGenerationRef.current) {
160
  break;
161
  }
 
3
  import { useTTS } from "./components/TTSContext";
4
  import { TTSProvider } from "./components/TTSProvider";
5
  import { streamTTS, createAudioBlob } from "./tts";
6
+ import { preprocessText } from "./text-preprocessor";
7
  import { SAMPLE_RATE, EXAMPLE_SENTENCES } from "./constants";
8
  import { AudioResult } from "./components/AudioResult";
9
  import { Controls } from "./components/Controls";
 
156
  if (!tts.current || !speakerEmbeddings.current) throw new Error("TTS pipeline not ready");
157
  const selectedEmbedding = speakerEmbeddings.current[voice];
158
 
159
+ const preprocessedText = preprocessText(text);
160
+ for await (const result of streamTTS(preprocessedText, tts.current, selectedEmbedding, quality, speed)) {
161
  if (stopGenerationRef.current) {
162
  break;
163
  }
src/text-preprocessor.ts ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/**
 * Characters rewritten BEFORE Unicode normalization, because NFKD would
 * otherwise turn them into something the later replacement table cannot see.
 */
const PRE_NORMALIZATION_REPLACEMENTS: ReadonlyArray<readonly [string, string]> = [
  ["\u2011", "-"], // non-breaking hyphen (NFKD maps it to U+2010)
  ["\u00B4", "'"], // acute accent (NFKD maps it to space + U+0301)
  ["\u00AF", " "], // macron (NFKD maps it to space + U+0304)
];

/** Single-character substitutions applied after normalization. */
const CHARACTER_REPLACEMENTS: ReadonlyArray<readonly [string, string]> = [
  ["\u2014", "-"], // em dash
  ["\u2013", "-"], // en dash
  ["\u2010", "-"], // hyphen (also what NFKD produces for other hyphen variants)
  ["_", " "], // underscore
  ["\u201C", '"'], // left double quote
  ["\u201D", '"'], // right double quote
  ["\u2018", "'"], // left single quote
  ["\u2019", "'"], // right single quote
  ["`", "'"], // grave accent
  ["[", " "],
  ["]", " "],
  ["|", " "],
  ["/", " "],
  ["#", " "],
  ["\u2192", " "], // rightwards arrow
  ["\u2190", " "], // leftwards arrow
];

/** Literal expressions that are spelled out for the speech model. */
const EXPRESSION_REPLACEMENTS: ReadonlyArray<readonly [string, string]> = [
  ["@", " at "],
  [" e.g.", " for example, "],
  [" i.e.", " that is, "],
];

const EMOJI_PATTERN =
  /([\u2600-\u27BF]|[\uE000-\uF8FF]|\uD83C[\uDC00-\uDFFF]|\uD83D[\uDC00-\uDFFF]|\uD83E[\uDD10-\uDDFF])/g;
const COMBINING_MARKS_PATTERN = /[\u0300-\u036f]/g;
const EXTRA_SYMBOLS_PATTERN = /[♥☆♡©\\]/g;
const TERMINAL_PUNCTUATION_PATTERN = /[.!?;:,'")\]}…。」』】〉》›»]$/;

/** Replaces every literal occurrence of `search`; no regex interpretation is involved. */
function replaceAllLiteral(text: string, search: string, replacement: string): string {
  return text.split(search).join(replacement);
}

/**
 * Preprocesses text for TTS by normalizing, cleaning, and correcting punctuation.
 * Based on: https://github.com/supertone-inc/supertonic/blob/main/csharp/Helper.cs
 *
 * Steps, in order: map NFKD-sensitive characters, normalize to NFKD, strip
 * emojis, apply character replacements, strip combining diacritics and a few
 * extra symbols, spell out known expressions, tidy spacing before punctuation,
 * collapse repeated quotes and whitespace, and finally append a period when
 * the text does not already end with punctuation, a quote, or a closing bracket.
 *
 * Note: an empty (or whitespace-only) input yields ".".
 *
 * @param text The input text to preprocess.
 * @returns The cleaned and preprocessed text.
 */
export function preprocessText(text: string): string {
  // These must be handled first: after NFKD they no longer exist as single
  // code points, so the table lookup below would silently miss them.
  for (const [from, to] of PRE_NORMALIZATION_REPLACEMENTS) {
    text = replaceAllLiteral(text, from, to);
  }

  // Normalize to NFKD form to separate characters from their diacritics.
  text = text.normalize("NFKD");

  // Remove emojis.
  text = text.replace(EMOJI_PATTERN, "");

  // Apply character replacements.
  for (const [from, to] of CHARACTER_REPLACEMENTS) {
    text = replaceAllLiteral(text, from, to);
  }

  // Remove combining diacritical marks left behind by the NFKD decomposition.
  text = text.replace(COMBINING_MARKS_PATTERN, "");

  // Remove special symbols that are not handled by emoji removal.
  text = text.replace(EXTRA_SYMBOLS_PATTERN, "");

  // Replace known expressions. These are matched literally so that the dots
  // in " e.g." / " i.e." do not act as regex wildcards (which would rewrite
  // ordinary words such as " eggs" or " item").
  for (const [from, to] of EXPRESSION_REPLACEMENTS) {
    text = replaceAllLiteral(text, from, to);
  }

  // Fix spacing around punctuation.
  text = text
    .replace(/ ,/g, ",")
    .replace(/ \./g, ".")
    .replace(/ !/g, "!")
    .replace(/ \?/g, "?")
    .replace(/ ;/g, ";")
    .replace(/ :/g, ":")
    .replace(/ '/g, "'");

  // Remove duplicate quotes. (Backticks were already mapped to "'" above, so
  // only the two ASCII quote characters can still occur here.)
  text = text.replace(/""+/g, '"');
  text = text.replace(/''+/g, "'");

  // Remove extra spaces.
  text = text.replace(/\s+/g, " ").trim();

  // If text doesn't end with punctuation, quotes, or closing brackets, add a period.
  if (!TERMINAL_PUNCTUATION_PATTERN.test(text)) {
    text += ".";
  }

  return text;
}