Spaces:

dwarkesh
/

transcript-aligner

Running

App Files Files Community

dwarkesh committed on Apr 2

Commit

fb1eceb

verified ·

1 Parent(s): 214a4d6

Update app.py

Browse files

Files changed (1) hide show

app.py +250 -153

app.py CHANGED Viewed

@@ -1,195 +1,248 @@
 import gradio as gr
 import re
 import difflib
 from typing import List, Dict, Tuple, Optional
 from dataclasses import dataclass
 @dataclass
 class Segment:
-    """Represents a transcript segment"""
     speaker: str
     timestamp: str
     text: str
-    raw_text: str  # For matching purposes - original text without formatting
 def parse_transcript(transcript: str) -> List[Segment]:
-    """Parse a transcript into segments, handling both markdown and plain formats"""
-    # This pattern matches both markdown and plain text formats:
-    # - "**Speaker X** *00:00:00*" or "Speaker X 00:00:00"
-    pattern = r"(?:\*\*)?(?:Speaker )?(\w+)(?:\*\*)? (?:\*)?(\d{2}:\d{2}:\d{2})(?:\*)?\s*\n\n(.*?)(?=\n\n(?:\*\*)?(?:Speaker )?|\Z)"
-    segments = []
-    for match in re.finditer(pattern, transcript, re.DOTALL):
         speaker, timestamp, text = match.groups()
-        # Remove any markdown formatting for matching purposes
-        raw_text = re.sub(r'\*\*|\*|\[.*?\]\(.*?\)', '', text.strip())
-        segments.append(Segment(speaker, timestamp, text.strip(), raw_text))
     return segments
def clean_text_for_comparison(text: str) -> str:
    """Normalise text so two versions of a turn can be compared.

    Markdown emphasis markers and whole markdown links are removed, the
    result is lower-cased, and every character that is neither a word
    character nor whitespace is dropped.
    """
    without_markup = re.sub(r'\*\*|\*|\[.*?\]\(.*?\)', '', text)
    words_only = re.sub(r'[^\w\s]', '', without_markup.lower())
    return words_only.strip()
def match_segments(auto_segments: List[Segment], human_segments: List[Segment]) -> List[Tuple[int, int]]:
    """Greedily pair human segments with auto segments by text similarity.

    Human segments are visited in order; each takes the still-unclaimed auto
    segment with the highest ``difflib`` ratio, provided that ratio is at
    least 0.6.

    Returns:
        ``(auto_index, human_index)`` tuples in human order.
    """
    normalized_auto = [clean_text_for_comparison(seg.raw_text) for seg in auto_segments]
    normalized_human = [clean_text_for_comparison(seg.raw_text) for seg in human_segments]

    pairs = []
    claimed = set()  # auto indices already paired with a human segment
    for human_idx, human_text in enumerate(normalized_human):
        winner, top_score = -1, 0
        for auto_idx, auto_text in enumerate(normalized_auto):
            if auto_idx in claimed:
                continue
            score = difflib.SequenceMatcher(None, auto_text, human_text).ratio()
            if score >= 0.6 and score > top_score:
                winner, top_score = auto_idx, score
        if winner < 0:
            continue
        claimed.add(winner)
        pairs.append((winner, human_idx))
    return pairs
def update_timestamps(auto_segments: List[Segment], human_segments: List[Segment], matches: List[Tuple[int, int]]) -> str:
    """Render the human transcript with timestamps taken from matched auto segments.

    Unmatched human segments keep their own timestamp. The output style
    (markdown or plain) is chosen from the first human segment.
    """
    retimed = list(human_segments)
    for auto_idx, human_idx in matches:
        source = human_segments[human_idx]
        # Only the timestamp changes; speaker and text stay as edited.
        retimed[human_idx] = Segment(
            speaker=source.speaker,
            timestamp=auto_segments[auto_idx].timestamp,
            text=source.text,
            raw_text=source.raw_text,
        )

    if human_segments:
        first = human_segments[0]
        is_markdown = "**" in first.text or "*" in first.timestamp
    else:
        is_markdown = False

    rendered = [
        f"**{segment.speaker}** *{segment.timestamp}*\n\n{segment.text}"
        if is_markdown
        else f"Speaker {segment.speaker} {segment.timestamp}\n\n{segment.text}"
        for segment in retimed
    ]
    return "\n\n".join(rendered)
def get_unmatched_auto_segments(auto_segments: List[Segment], matches: List[Tuple[int, int]]) -> List[int]:
    """Return, in ascending order, the auto-segment indices absent from ``matches``."""
    used = {auto_idx for auto_idx, _ in matches}
    return [idx for idx, _ in enumerate(auto_segments) if idx not in used]
def get_unmatched_human_segments(human_segments: List[Segment], matches: List[Tuple[int, int]]) -> List[int]:
    """Return, in ascending order, the human-segment indices absent from ``matches``."""
    used = {human_idx for _, human_idx in matches}
    return [idx for idx, _ in enumerate(human_segments) if idx not in used]
def format_segments(segments: List[Segment], indices: List[int], is_markdown: bool) -> str:
    """Render the segments selected by ``indices`` for display.

    Returns the string ``"None"`` when ``indices`` is empty; otherwise the
    selected segments, in the order given, separated by blank lines.
    """
    if not indices:
        return "None"

    def render(segment):
        if is_markdown:
            return f"**{segment.speaker}** *{segment.timestamp}*\n\n{segment.text}"
        return f"Speaker {segment.speaker} {segment.timestamp}\n\n{segment.text}"

    return "\n\n".join(render(segments[idx]) for idx in indices)
def process_transcripts(auto_transcript: str, human_transcript: str):
    """Transfer timestamps from the auto transcript onto the human transcript.

    Args:
        auto_transcript: Auto-generated transcript carrying the new timestamps.
        human_transcript: Human-edited transcript carrying the old timestamps.

    Returns:
        A ``(updated_transcript, stats_markdown)`` pair. When either input
        yields no segments, the first element is an error message and the
        second is empty.
    """
    auto_segments = parse_transcript(auto_transcript)
    human_segments = parse_transcript(human_transcript)

    if not auto_segments or not human_segments:
        # Same arity as the success path: the click handler is wired to
        # exactly two outputs, so a third value here was a mismatch.
        return "Error: Could not parse transcripts. Check formatting.", ""

    matches = match_segments(auto_segments, human_segments)
    unmatched_auto = get_unmatched_auto_segments(auto_segments, matches)
    unmatched_human = get_unmatched_human_segments(human_segments, matches)

    # Any asterisk in the human transcript selects markdown rendering for
    # the unmatched-segment listings ("**" implies "*").
    is_markdown = "*" in human_transcript

    updated_transcript = update_timestamps(auto_segments, human_segments, matches)

    parts = [
        "### Matching Statistics\n\n",
        f"- Auto-generated segments: {len(auto_segments)}\n",
        f"- Human-edited segments: {len(human_segments)}\n",
        f"- Matched segments: {len(matches)}\n",
        f"- Unmatched auto segments (new content): {len(unmatched_auto)}\n",
        f"- Unmatched human segments (removed content): {len(unmatched_human)}\n",
    ]
    if unmatched_auto:
        parts.append("\n### New Content (In Auto-generated but not in Human-edited)\n\n")
        parts.append(format_segments(auto_segments, unmatched_auto, is_markdown))
    if unmatched_human:
        parts.append("\n### Removed Content (In Human-edited but not in Auto-generated)\n\n")
        parts.append(format_segments(human_segments, unmatched_human, is_markdown))

    return updated_transcript, "".join(parts)
 # Create Gradio interface
-with gr.Blocks(title="Transcript Timestamp Updater") as demo:
     gr.Markdown("""
-    # 🎙️ Transcript Timestamp Updater
-    This tool updates timestamps in human-edited transcripts based on auto-generated transcripts.
     ## Instructions:
-    1. Paste your new auto-generated transcript (with updated timestamps)
-    2. Paste your human-edited transcript (with old timestamps)
-    3. Click "Update Timestamps"
-    The tool will match segments between transcripts and update the timestamps while preserving all human edits.
     """)
     with gr.Row():
         with gr.Column():
-            auto_transcript = gr.TextArea(
-                label="Auto-Generated Transcript (with new timestamps)",
                 placeholder="Paste the auto-generated transcript here...",
-                lines=15
             )
         with gr.Column():
-            human_transcript = gr.TextArea(
-                label="Human-Edited Transcript (with old timestamps)",
                 placeholder="Paste the human-edited transcript here...",
-                lines=15
             )
-    update_btn = gr.Button("Update Timestamps")
     with gr.Tabs():
         with gr.TabItem("Updated Transcript"):
@@ -198,19 +251,63 @@ with gr.Blocks(title="Transcript Timestamp Updater") as demo:
                 placeholder="The updated transcript will appear here...",
                 lines=20
             )
-        with gr.TabItem("Statistics"):
-            stats = gr.Markdown(
-                label="Statistics",
-                value="Statistics will appear here..."
             )
     update_btn.click(
-        fn=process_transcripts,
-        inputs=[auto_transcript, human_transcript],
-        outputs=[updated_transcript, stats]
     )
-# Launch the app
 if __name__ == "__main__":
     demo.launch()

 import gradio as gr
 import re
 import difflib
+import os
 from typing import List, Dict, Tuple, Optional
 from dataclasses import dataclass
+import numpy as np
 @dataclass
 class Segment:
+    """A segment of a transcript with a speaker and text"""
     speaker: str
     timestamp: str
     text: str
+    original_text: str  # The text as it appears in the original transcript
+    index: int  # Position in the original transcript
def clean_text_for_matching(text: str) -> str:
    """Normalise text for similarity matching.

    Markdown links are reduced to their link text, asterisks are removed,
    the punctuation marks ``, . ; : ! ?`` become spaces, whitespace runs
    collapse to one space, and the result is lower-cased and stripped.
    """
    # [label](target) -> label
    linkless = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)
    # Dropping every asterisk removes both bold and italic markers.
    plain = linkless.replace('*', '')
    spaced = plain.translate(str.maketrans({mark: ' ' for mark in ',.;:!?'}))
    collapsed = re.sub(r'\s+', ' ', spaced)
    return collapsed.lower().strip()
def load_transcript_file(file_path: str) -> str:
    """Return the full contents of the UTF-8 text file at ``file_path``."""
    with open(file_path, encoding='utf-8') as handle:
        contents = handle.read()
    return contents
 def parse_transcript(transcript: str) -> List[Segment]:
+    """
+    Parse transcript into segments.
+    Works with both formats:
+    - Speaker LastName 00:00:00
+    - **Speaker LastName** *00:00:00*
+    """
+    # Match both markdown and plain formats
+    pattern = r"(?:\*\*)?(?:Speaker\s+)?([A-Za-z]+)(?:\*\*)?\s+(?:\*)?([0-9:]+)(?:\*)?\s*\n\n(.*?)(?=\n\n(?:\*\*)?(?:Speaker\s+)?[A-Za-z]+|\Z)"
+    segments = []
+    for i, match in enumerate(re.finditer(pattern, transcript, re.DOTALL)):
         speaker, timestamp, text = match.groups()
+        original_text = text.strip()
+        cleaned_text = clean_text_for_matching(original_text)
+        segments.append(Segment(speaker, timestamp, cleaned_text, original_text, i))
     return segments
def align_segments(auto_segments: List[Segment], human_segments: List[Segment],
                   min_similarity: float = 0.5) -> Dict[int, int]:
    """Align human-edited segments to auto-generated segments, preserving order.

    Human segments are visited in order. Each takes the most similar auto
    segment that lies after the previously aligned one, provided the
    ``difflib`` ratio of their ``text`` fields is strictly greater than
    ``min_similarity``. On equal ratios the earliest auto segment wins.

    Args:
        auto_segments: Segments of the auto-generated transcript.
        human_segments: Segments of the human-edited transcript.
        min_similarity: Ratio a pair must exceed to count as a match.

    Returns:
        Mapping of human segment index to auto segment index. Both keys and
        values are strictly increasing; unmatched human segments are absent.
    """
    # similarity[h][a] == SequenceMatcher(None, human.text, auto.text).ratio()
    similarity = [[0.0] * len(auto_segments) for _ in human_segments]
    # SequenceMatcher caches its analysis of the second sequence, so fix the
    # auto text with set_seq2 and sweep the human texts with set_seq1. The
    # argument order is unchanged, so the ratios are the same.
    matcher = difflib.SequenceMatcher(None)
    for a_idx, a_segment in enumerate(auto_segments):
        matcher.set_seq2(a_segment.text)
        for h_idx, h_segment in enumerate(human_segments):
            matcher.set_seq1(h_segment.text)
            similarity[h_idx][a_idx] = matcher.ratio()

    alignments = {}
    # Alignments are made in increasing auto order, so every index at or
    # after next_auto is both unassigned and order-preserving.
    next_auto = 0
    for h_idx in range(len(human_segments)):
        best_match = -1
        best_similarity = min_similarity
        for a_idx in range(next_auto, len(auto_segments)):
            if similarity[h_idx][a_idx] > best_similarity:
                best_match = a_idx
                best_similarity = similarity[h_idx][a_idx]
        if best_match >= 0:
            alignments[h_idx] = best_match
            next_auto = best_match + 1
    return alignments
def update_transcript(human_segments: List[Segment], auto_segments: List[Segment],
                      alignments: Dict[int, int], is_markdown: bool) -> str:
    """Render the human transcript with timestamps from the aligned auto segments.

    Each human segment keeps its speaker and ``original_text``. An aligned
    segment takes the timestamp of its auto counterpart; an unaligned one
    keeps its own timestamp followed by ``[NO MATCH]``. Headers are written
    in markdown or plain style according to ``is_markdown``, and segments
    are separated by blank lines.
    """
    rendered = []
    for position, turn in enumerate(human_segments):
        if position in alignments:
            stamp = auto_segments[alignments[position]].timestamp
        else:
            # Keep the old timestamp but flag it for manual review.
            stamp = f"{turn.timestamp} [NO MATCH]"

        if is_markdown:
            header = f"**{turn.speaker}** *{stamp}*"
        else:
            header = f"Speaker {turn.speaker} {stamp}"
        rendered.append(f"{header}\n\n{turn.original_text}")
    return "\n\n".join(rendered)
def generate_match_report(human_segments: List[Segment], auto_segments: List[Segment],
                         alignments: Dict[int, int]) -> str:
    """Build a markdown report summarising an alignment.

    Args:
        human_segments: Segments of the human-edited transcript.
        auto_segments: Segments of the auto-generated transcript.
        alignments: Mapping of human segment index to auto segment index.

    Returns:
        Markdown with segment counts and the matched percentage, a list of
        unmatched human segments (first 50 characters of their matching
        text) when there are any, and the average similarity of the matched
        pairs when there is at least one.
    """
    total_human = len(human_segments)
    total_auto = len(auto_segments)
    total_matched = len(alignments)
    # An empty human transcript would otherwise divide by zero.
    matched_pct = total_matched / total_human * 100 if total_human else 0.0

    parts = [
        "### Matching Report\n\n",
        f"- Human segments: {total_human}\n",
        f"- Auto segments: {total_auto}\n",
        f"- Matched segments: {total_matched} ({matched_pct:.1f}%)\n",
    ]

    if total_matched < total_human:
        parts.append(f"\n### Unmatched Segments ({total_human - total_matched})\n\n")
        parts.extend(
            f"- Speaker {h_segment.speaker} at {h_segment.timestamp}: '{h_segment.text[:50]}...'\n"
            for h_idx, h_segment in enumerate(human_segments)
            if h_idx not in alignments
        )

    if alignments:
        similarities = [
            difflib.SequenceMatcher(None,
                                    human_segments[h_idx].text,
                                    auto_segments[a_idx].text).ratio()
            for h_idx, a_idx in alignments.items()
        ]
        avg_similarity = sum(similarities) / len(similarities)
        parts.append("\n### Match Quality\n\n")
        parts.append(f"- Average similarity: {avg_similarity:.2f}\n")

    return "".join(parts)
def process_transcripts(auto_transcript, human_transcript):
    """Synchronise the human transcript's timestamps with the auto transcript.

    Either argument may be ``str`` or UTF-8 ``bytes``.

    Returns:
        ``(updated_transcript, report)``. If either transcript yields no
        segments, or any exception is raised while processing, the first
        element is an error message and the second is an empty string.
    """
    def as_text(raw):
        return raw.decode('utf-8') if isinstance(raw, bytes) else raw

    try:
        auto_content = as_text(auto_transcript)
        human_content = as_text(human_transcript)

        # Output style follows the human transcript: bold markers mean markdown.
        is_markdown = "**" in human_content

        auto_segments = parse_transcript(auto_content)
        human_segments = parse_transcript(human_content)
        if not (auto_segments and human_segments):
            return "Error: Could not parse transcripts. Please check the format.", ""

        alignments = align_segments(auto_segments, human_segments)
        updated_transcript = update_transcript(human_segments, auto_segments, alignments, is_markdown)
        report = generate_match_report(human_segments, auto_segments, alignments)
    except Exception as e:
        return f"Error processing transcripts: {e}", ""
    return updated_transcript, report
def save_transcript(transcript: str) -> str:
    """Write ``transcript`` to ``output/updated_transcript.md`` and return that path.

    The ``output`` directory is created relative to the current working
    directory if it does not exist. An existing file is overwritten.

    Args:
        transcript: Text to write, encoded as UTF-8.

    Returns:
        The relative path of the written file.
    """
    output_dir = "output"
    # exist_ok avoids the check-then-create race when two requests arrive
    # before the directory exists.
    os.makedirs(output_dir, exist_ok=True)
    # NOTE(review): the file name is fixed, so concurrent callers overwrite
    # each other's output - confirm whether per-request names are needed.
    output_path = os.path.join(output_dir, "updated_transcript.md")
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(transcript)
    return output_path
 # Create Gradio interface
+with gr.Blocks(title="Transcript Timestamp Synchronizer") as demo:
     gr.Markdown("""
+    # 🎙️ Transcript Timestamp Synchronizer
+    This tool updates timestamps in human-edited transcripts based on new auto-generated transcripts.
     ## Instructions:
+    1. Upload or paste your new auto-generated transcript (with updated timestamps)
+    2. Upload or paste your human-edited transcript (with old timestamps)
+    3. Click "Synchronize Timestamps" to generate an updated transcript
+    The tool will match segments between the transcripts and update the timestamps while preserving all human edits.
     """)
     with gr.Row():
         with gr.Column():
+            auto_source = gr.Radio(
+                ["Upload File", "Paste Text"],
+                label="Auto-generated Transcript Source",
+                value="Paste Text"
+            )
+            auto_file = gr.File(
+                label="Upload Auto-generated Transcript",
+                file_types=[".md", ".txt"],
+                visible=False
+            )
+            auto_text = gr.TextArea(
+                label="Auto-generated Transcript (with new timestamps)",
                 placeholder="Paste the auto-generated transcript here...",
+                lines=15,
+                visible=True
             )
         with gr.Column():
+            human_source = gr.Radio(
+                ["Upload File", "Paste Text"],
+                label="Human-edited Transcript Source",
+                value="Paste Text"
+            )
+            human_file = gr.File(
+                label="Upload Human-edited Transcript",
+                file_types=[".md", ".txt"],
+                visible=False
+            )
+            human_text = gr.TextArea(
+                label="Human-edited Transcript (with old timestamps)",
                 placeholder="Paste the human-edited transcript here...",
+                lines=15,
+                visible=True
             )
+    update_btn = gr.Button("Synchronize Timestamps")
     with gr.Tabs():
         with gr.TabItem("Updated Transcript"):
                 placeholder="The updated transcript will appear here...",
                 lines=20
             )
+            download_btn = gr.Button("Download Updated Transcript")
+            download_path = gr.File(label="Download", visible=False)
+        with gr.TabItem("Matching Report"):
+            matching_report = gr.Markdown(
+                label="Matching Report",
+                value="The matching report will appear here..."
             )
+    # Handle visibility of upload/paste options
+    def update_auto_visibility(choice):
+        return gr.update(visible=choice=="Upload File"), gr.update(visible=choice=="Paste Text")
+    def update_human_visibility(choice):
+        return gr.update(visible=choice=="Upload File"), gr.update(visible=choice=="Paste Text")
+    auto_source.change(update_auto_visibility, auto_source, [auto_file, auto_text])
+    human_source.change(update_human_visibility, human_source, [human_file, human_text])
+    # Load file content if uploaded
+    def load_auto_file(file):
+        if file is None:
+            return ""
+        with open(file.name, "r", encoding="utf-8") as f:
+            return f.read()
+    def load_human_file(file):
+        if file is None:
+            return ""
+        with open(file.name, "r", encoding="utf-8") as f:
+            return f.read()
+    auto_file.change(load_auto_file, auto_file, auto_text)
+    human_file.change(load_human_file, human_file, human_text)
+    # Process transcripts
+    def handle_process(auto_content, human_content):
+        return process_transcripts(auto_content, human_content)
     update_btn.click(
+        fn=handle_process,
+        inputs=[auto_text, human_text],
+        outputs=[updated_transcript, matching_report]
+    )
+    # Handle download
+    def prepare_download(transcript):
+        if not transcript:
+            return None
+        return save_transcript(transcript)
+    download_btn.click(
+        fn=prepare_download,
+        inputs=[updated_transcript],
+        outputs=[download_path]
     )
+# For local testing
 if __name__ == "__main__":
     demo.launch()