Spaces:
Runtime error
Runtime error
Rivalcoder
committed on
Commit
·
cbf58a5
1
Parent(s):
033ac17
Add
Browse files
- Dockerfile +19 -0
- app.py +44 -0
- requirements.txt +3 -0
Dockerfile
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.10-slim

# Install Chrome from Google's apt repository.
# The signing key goes into a dedicated keyring referenced with "signed-by"
# instead of the deprecated `apt-key add`, which newer Debian bases no longer ship.
# The apt package lists are removed at the end to keep the layer small.
RUN apt-get update \
    && apt-get install -y wget gnupg unzip curl \
    && wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub \
        | gpg --dearmor -o /usr/share/keyrings/google-chrome.gpg \
    && echo "deb [arch=amd64 signed-by=/usr/share/keyrings/google-chrome.gpg] http://dl.google.com/linux/chrome/deb/ stable main" \
        > /etc/apt/sources.list.d/google-chrome.list \
    && apt-get update \
    && apt-get install -y google-chrome-stable \
    && rm -rf /var/lib/apt/lists/*

# Install dependencies (copied before the app code so this layer is cached
# when only app.py changes)
COPY requirements.txt /app/requirements.txt
WORKDIR /app
RUN pip install --no-cache-dir -r requirements.txt

# Copy app code
COPY app.py /app/app.py

# Run the app
CMD ["python", "app.py"]
|
app.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import gradio as gr
|
| 3 |
+
from selenium.webdriver.common.by import By
|
| 4 |
+
import undetected_chromedriver as uc
|
| 5 |
+
|
| 6 |
+
def get_captions_selenium(video_url):
    """Report whether a video page's source mentions caption tracks.

    Opens ``video_url`` in a headless Chrome session, waits a fixed five
    seconds for the page to load, and checks the rendered page source for
    the ``captionTracks`` marker. The caption data itself is not parsed;
    only a status message is produced.

    Args:
        video_url: URL of the video page to load (the UI labels it as a
            YouTube video URL).

    Returns:
        A human-readable status string: a success message when the marker
        is present, a warning when it is absent, or an error message
        (prefixed with "❌ Error:") when anything raised along the way.
    """
    driver = None
    try:
        # Launch browser
        options = uc.ChromeOptions()
        options.add_argument("--headless")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")

        driver = uc.Chrome(options=options)

        driver.get(video_url)
        # Fixed delay to give the page time to finish loading.
        time.sleep(5)

        # NOTE: clicking "..." -> "Open transcript" is not implemented; the
        # YouTube UI changes often, so only the page source is inspected
        # (this is where auto-generated caption info would show up).
        if "captionTracks" in driver.page_source:
            return "✅ Found potential captions info in page source (you may need to parse this JSON)."
        return "⚠️ Captions info not found in source. May not be available or blocked."

    except Exception as e:
        return f"❌ Error: {str(e)}"

    finally:
        # Always shut the browser down, including on failures after launch,
        # so a failed request does not leave a Chrome process behind.
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                # Best-effort cleanup: a failing quit must not replace the
                # status message that is already being returned.
                pass
|
| 36 |
+
|
| 37 |
+
# Gradio interface
|
| 38 |
+
# Build the single-textbox UI around get_captions_selenium and start serving.
demo = gr.Interface(
    fn=get_captions_selenium,
    inputs=[gr.Textbox(label="YouTube Video URL")],
    outputs="text",
    title="YouTube Captions Scraper (Selenium)",
    description="Extract captions using headless browser via Selenium.",
)

# Bind to all interfaces on port 7860: inside a Docker container the default
# loopback address is not reachable from outside the container.
demo.launch(server_name="0.0.0.0", server_port=7860)
|
requirements.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio
|
| 2 |
+
selenium
|
| 3 |
+
undetected-chromedriver
|