CHUNYU0505 committed on
Commit
772ae76
·
1 Parent(s): 1682c26

Add application file

Browse files
Files changed (1) hide show
  1. app.py +133 -0
app.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# -------------------------------
# 1. Imports
# -------------------------------
import glob
import os
import time

import gradio as gr
from docx import Document as DocxDocument
from langchain.chains import RetrievalQA
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings
# Fix: langchain.chat_models does not export "ChatHuggingFaceHub" — importing it
# raises ImportError at startup. The Hugging Face hosted-inference LLM wrapper is
# HuggingFaceHub (it accepts repo_id, model_kwargs, huggingfacehub_api_token,
# exactly as used below). Alias it under the name the rest of the script uses.
from langchain.llms import HuggingFaceHub as ChatHuggingFaceHub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
13
+
14
# -------------------------------
# 2. Path configuration
# -------------------------------
txt_folder = "out_texts" # folder holding the source .txt corpus
db_path = "faiss_db"     # directory where the FAISS index is persisted
os.makedirs(db_path, exist_ok=True)  # idempotent: safe on restart

# -------------------------------
# 3. Build the embeddings model
# -------------------------------
# Small sentence-transformers model used both to build and to query the index.
embeddings_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
25
+
26
# -------------------------------
# 4. Build or load the vector database
# -------------------------------
# Reuse a previously saved FAISS index when one exists under db_path;
# otherwise embed every .txt file in txt_folder and persist a fresh index.
if os.path.exists(os.path.join(db_path, "index.faiss")):
    print("載入現有向量資料庫...")
    # allow_dangerous_deserialization: the pickle being loaded was written
    # by this same app, so it is trusted input here.
    db = FAISS.load_local(db_path, embeddings_model, allow_dangerous_deserialization=True)
else:
    print("沒有資料庫,開始建立新向量資料庫...")
    txt_files = glob.glob(f"{txt_folder}/*.txt")
    docs = []
    for filepath in txt_files:
        with open(filepath, "r", encoding="utf-8") as f:
            # Keep the source filename in metadata so retrieved passages
            # can be traced back to their file.
            docs.append(Document(page_content=f.read(), metadata={"source": os.path.basename(filepath)}))
    # 1000-char chunks with 100-char overlap keep retrieved passages coherent.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    split_docs = text_splitter.split_documents(docs)
    print("產生向量嵌入中...")
    # NOTE(review): if txt_folder is empty, split_docs is [] and
    # FAISS.from_documents will fail — presumably the corpus ships with the
    # Space; confirm before deploying.
    db = FAISS.from_documents(split_docs, embeddings_model)
    db.save_local(db_path)
    print("向量資料庫建立完成。")
45
+
46
# -------------------------------
# 5. Hugging Face model settings
# -------------------------------
HUGGINGFACE_API_TOKEN = os.getenv("HF_TOKEN") # recommended: set in Spaces Secrets

# User-selectable models mapped to their max_new_tokens budget.
MODEL_DICT = {
    "google/flan-t5-large": 512,
    "tiiuae/falcon-7b-instruct": 512
}

# Simple process-wide rate limit: at most MAX_HOURLY_REQUESTS successful
# generations per rolling one-hour window.
MAX_HOURLY_REQUESTS = 50
request_count = 0              # successful generations in the current window
last_reset_time = time.time()  # start timestamp of the current window
59
+
60
# -------------------------------
# 6. RAG main function
# -------------------------------
def rag_generate_hfapi(query, model_name, segments=5, max_words=1500):
    """Generate a segmented article about *query* via RAG over the FAISS index.

    Parameters:
        query: article topic entered by the user.
        model_name: key into MODEL_DICT selecting the hosted HF model.
        segments: number of paragraphs requested (UI slider enforces >= 1).
        max_words: soft total word budget written into the prompt.

    Returns:
        (article_text, docx_path) on success, or (error_message, None) when
        the hourly quota is exhausted or generation raises.
    """
    global request_count, last_reset_time

    # Reset the rolling one-hour rate-limit window once it has elapsed.
    if time.time() - last_reset_time > 3600:
        request_count = 0
        last_reset_time = time.time()

    if request_count >= MAX_HOURLY_REQUESTS:
        return f"本小時生成次數已達上限 ({MAX_HOURLY_REQUESTS}),請稍後再試。", None

    # A fresh LLM client per request, sized by the selected model's token budget.
    llm = ChatHuggingFaceHub(
        repo_id=model_name,
        model_kwargs={"temperature": 0.7, "max_new_tokens": MODEL_DICT[model_name]},
        huggingfacehub_api_token=HUGGINGFACE_API_TOKEN
    )

    # Retrieval-augmented QA over the module-level FAISS index `db`.
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=db.as_retriever(search_type="similarity", search_kwargs={"k": 5}),
        return_source_documents=True
    )

    # Fix: guard the divisor — the Gradio slider's minimum is 1, but a direct
    # API caller could pass segments=0 and trigger ZeroDivisionError.
    words_per_segment = max_words // max(segments, 1)
    prompt = f"""請依據下列主題生成一篇文章:
主題:{query}
需求:
- 總共 {segments} 段
- 每段約 {words_per_segment} 字
- 總字數請控制在 {max_words} 字以內
- 請自動分段輸出
"""

    try:
        result = qa_chain({"query": prompt})
        full_text = result["result"].strip()
        if not full_text:
            full_text = "(生成失敗,請改用其他模型或調整段落數)"
    except Exception as e:
        # Surface the failure in the UI instead of crashing the worker;
        # failed attempts do not count against the hourly quota.
        return f"(生成失敗:{str(e)})", None

    request_count += 1  # only successful generations consume the quota

    paragraphs = [p.strip() for p in full_text.split("\n") if p.strip()]

    # Fix: the original saved every request to the fixed path
    # "generated_article.docx", so concurrent Gradio requests clobbered each
    # other's download file. Write each article to its own temporary file.
    import tempfile
    with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as tmp:
        docx_file = tmp.name
    doc = DocxDocument()
    doc.add_heading(query, level=1)
    for p in paragraphs:
        doc.add_paragraph(p)
    doc.save(docx_file)

    return "\n\n".join(paragraphs), docx_file
113
+
114
# -------------------------------
# 7. Gradio UI
# -------------------------------
# Build each widget up front, then wire them into a single Interface.
topic_input = gr.Textbox(lines=2, placeholder="請輸入文章主題")
model_choice = gr.Dropdown(list(MODEL_DICT.keys()), value="google/flan-t5-large", label="選擇模型")
segment_count = gr.Slider(minimum=1, maximum=10, value=5, step=1, label="段落數")
word_limit = gr.Slider(minimum=500, maximum=3000, value=1500, step=100, label="文章字數上限")

article_output = gr.Textbox(label="生成文章")
docx_output = gr.File(label="下載 DOCX")

iface = gr.Interface(
    fn=rag_generate_hfapi,
    inputs=[topic_input, model_choice, segment_count, word_limit],
    outputs=[article_output, docx_output],
    title="佛教經論 RAG 系統 (Hugging Face API)",
    description="使用 Hugging Face API 生成文章,可選大模型,分段生成並下載 DOCX,每小時生成次數有限制"
)

iface.launch()