Spaces:
Running
Running
Upload 3 files
Browse files- mtdna_backend.py +32 -21
- pipeline.py +82 -8
- smart_fallback.py +156 -0
mtdna_backend.py
CHANGED
|
@@ -139,12 +139,12 @@ def summarize_results(accession):
|
|
| 139 |
if cached:
|
| 140 |
print(f"✅ Using cached result for {accession}")
|
| 141 |
return [[
|
| 142 |
-
cached["Sample ID"],
|
| 143 |
-
cached["Predicted Country"],
|
| 144 |
-
cached["Country Explanation"],
|
| 145 |
-
cached["Predicted Sample Type"],
|
| 146 |
-
cached["Sample Type Explanation"],
|
| 147 |
-
cached["Sources"],
|
| 148 |
cached["Time cost"]
|
| 149 |
]]
|
| 150 |
# only run when nothing in the cache
|
|
@@ -175,13 +175,15 @@ def summarize_results(accession):
|
|
| 175 |
pred_country, pred_sample, country_explanation, sample_explanation = "unknown","unknown","unknown","unknown"
|
| 176 |
for section, results in outputs[key].items():
|
| 177 |
if section == "country" or section =="sample_type":
|
| 178 |
-
pred_output = "\n".join(list(results.keys()))
|
| 179 |
output_explanation = ""
|
| 180 |
for result, content in results.items():
|
| 181 |
if len(result) == 0: result = "unknown"
|
| 182 |
if len(content) == 0: output_explanation = "unknown"
|
| 183 |
else:
|
| 184 |
output_explanation += 'Method: ' + "\nMethod: ".join(content) + "\n"
|
|
|
|
|
|
|
| 185 |
if section == "country":
|
| 186 |
pred_country, country_explanation = pred_output, output_explanation
|
| 187 |
elif section == "sample_type":
|
|
@@ -191,24 +193,24 @@ def summarize_results(accession):
|
|
| 191 |
else: label = key
|
| 192 |
if len(outputs[key]["source"]) == 0: outputs[key]["source"] = ["No Links"]
|
| 193 |
row = {
|
| 194 |
-
"Sample ID": label,
|
| 195 |
-
"Predicted Country": pred_country,
|
| 196 |
-
"Country Explanation": country_explanation,
|
| 197 |
-
"Predicted Sample Type":pred_sample,
|
| 198 |
-
"Sample Type Explanation":sample_explanation,
|
| 199 |
-
"Sources": "\n".join(outputs[key]["source"]),
|
| 200 |
"Time cost": outputs[key]["time_cost"]
|
| 201 |
}
|
| 202 |
#row_score.append(row)
|
| 203 |
rows.append(list(row.values()))
|
| 204 |
|
| 205 |
save_row = {
|
| 206 |
-
"Sample ID": label,
|
| 207 |
-
"Predicted Country": pred_country,
|
| 208 |
-
"Country Explanation": country_explanation,
|
| 209 |
-
"Predicted Sample Type":pred_sample,
|
| 210 |
-
"Sample Type Explanation":sample_explanation,
|
| 211 |
-
"Sources": "\n".join(outputs[key]["source"]),
|
| 212 |
"Query_cost": outputs[key]["query_cost"],
|
| 213 |
"Time cost": outputs[key]["time_cost"]
|
| 214 |
}
|
|
@@ -530,14 +532,23 @@ def check_known_output(accession):
|
|
| 530 |
if "Sample ID" not in df.columns:
|
| 531 |
print("❌ Column 'Sample ID' not found in Google Sheet.")
|
| 532 |
return None
|
| 533 |
-
|
| 534 |
match = re.search(r"\b[A-Z]{2,4}\d{4,}", accession)
|
| 535 |
if match:
|
| 536 |
accession = match.group(0)
|
| 537 |
|
| 538 |
matched = df[df["Sample ID"].str.contains(accession, case=False, na=False)]
|
| 539 |
if not matched.empty:
|
| 540 |
-
return matched.iloc[0].to_dict()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 541 |
else:
|
| 542 |
print(f"🔍 Accession {accession} not found in known_samples.")
|
| 543 |
return None
|
|
|
|
| 139 |
if cached:
|
| 140 |
print(f"✅ Using cached result for {accession}")
|
| 141 |
return [[
|
| 142 |
+
cached["Sample ID"] or "unknown",
|
| 143 |
+
cached["Predicted Country"] or "unknown",
|
| 144 |
+
cached["Country Explanation"] or "unknown",
|
| 145 |
+
cached["Predicted Sample Type"] or "unknown",
|
| 146 |
+
cached["Sample Type Explanation"] or "unknown",
|
| 147 |
+
cached["Sources"] or "No Links",
|
| 148 |
cached["Time cost"]
|
| 149 |
]]
|
| 150 |
# only run when nothing in the cache
|
|
|
|
| 175 |
pred_country, pred_sample, country_explanation, sample_explanation = "unknown","unknown","unknown","unknown"
|
| 176 |
for section, results in outputs[key].items():
|
| 177 |
if section == "country" or section =="sample_type":
|
| 178 |
+
pred_output = []#"\n".join(list(results.keys()))
|
| 179 |
output_explanation = ""
|
| 180 |
for result, content in results.items():
|
| 181 |
if len(result) == 0: result = "unknown"
|
| 182 |
if len(content) == 0: output_explanation = "unknown"
|
| 183 |
else:
|
| 184 |
output_explanation += 'Method: ' + "\nMethod: ".join(content) + "\n"
|
| 185 |
+
pred_output.append(result)
|
| 186 |
+
pred_output = "\n".join(pred_output)
|
| 187 |
if section == "country":
|
| 188 |
pred_country, country_explanation = pred_output, output_explanation
|
| 189 |
elif section == "sample_type":
|
|
|
|
| 193 |
else: label = key
|
| 194 |
if len(outputs[key]["source"]) == 0: outputs[key]["source"] = ["No Links"]
|
| 195 |
row = {
|
| 196 |
+
"Sample ID": label or "unknown",
|
| 197 |
+
"Predicted Country": pred_country or "unknown",
|
| 198 |
+
"Country Explanation": country_explanation or "unknown",
|
| 199 |
+
"Predicted Sample Type":pred_sample or "unknown",
|
| 200 |
+
"Sample Type Explanation":sample_explanation or "unknown",
|
| 201 |
+
"Sources": "\n".join(outputs[key]["source"]) or "No Links",
|
| 202 |
"Time cost": outputs[key]["time_cost"]
|
| 203 |
}
|
| 204 |
#row_score.append(row)
|
| 205 |
rows.append(list(row.values()))
|
| 206 |
|
| 207 |
save_row = {
|
| 208 |
+
"Sample ID": label or "unknown",
|
| 209 |
+
"Predicted Country": pred_country or "unknown",
|
| 210 |
+
"Country Explanation": country_explanation or "unknown",
|
| 211 |
+
"Predicted Sample Type":pred_sample or "unknown",
|
| 212 |
+
"Sample Type Explanation":sample_explanation or "unknown",
|
| 213 |
+
"Sources": "\n".join(outputs[key]["source"]) or "No Links",
|
| 214 |
"Query_cost": outputs[key]["query_cost"],
|
| 215 |
"Time cost": outputs[key]["time_cost"]
|
| 216 |
}
|
|
|
|
| 532 |
if "Sample ID" not in df.columns:
|
| 533 |
print("❌ Column 'Sample ID' not found in Google Sheet.")
|
| 534 |
return None
|
| 535 |
+
|
| 536 |
match = re.search(r"\b[A-Z]{2,4}\d{4,}", accession)
|
| 537 |
if match:
|
| 538 |
accession = match.group(0)
|
| 539 |
|
| 540 |
matched = df[df["Sample ID"].str.contains(accession, case=False, na=False)]
|
| 541 |
if not matched.empty:
|
| 542 |
+
#return matched.iloc[0].to_dict()
|
| 543 |
+
row = matched.iloc[0]
|
| 544 |
+
country = row.get("Predicted Country", "").strip().lower()
|
| 545 |
+
sample_type = row.get("Predicted Sample Type", "").strip().lower()
|
| 546 |
+
|
| 547 |
+
if country and country != "unknown" and sample_type and sample_type != "unknown":
|
| 548 |
+
return row.to_dict()
|
| 549 |
+
else:
|
| 550 |
+
print(f"⚠️ Accession {accession} found but country/sample_type is unknown or empty.")
|
| 551 |
+
return None
|
| 552 |
else:
|
| 553 |
print(f"🔍 Accession {accession} not found in known_samples.")
|
| 554 |
return None
|
pipeline.py
CHANGED
|
@@ -6,6 +6,7 @@ import data_preprocess
|
|
| 6 |
import model
|
| 7 |
import mtdna_classifier
|
| 8 |
#import app
|
|
|
|
| 9 |
import pandas as pd
|
| 10 |
from pathlib import Path
|
| 11 |
import subprocess
|
|
@@ -27,7 +28,7 @@ import io
|
|
| 27 |
import json
|
| 28 |
#––– Authentication setup –––
|
| 29 |
GDRIVE_PARENT_FOLDER_NAME = "mtDNA-Location-Classifier"
|
| 30 |
-
GDRIVE_DATA_FOLDER_NAME = "
|
| 31 |
GCP_CREDS_DICT = json.loads(os.environ["GCP_CREDS_JSON"]) # from HF secrets
|
| 32 |
GDRIVE_CREDS = Credentials.from_service_account_info(GCP_CREDS_DICT, scopes=["https://www.googleapis.com/auth/drive"])
|
| 33 |
drive_service = build("drive", "v3", credentials=GDRIVE_CREDS)
|
|
@@ -216,13 +217,18 @@ def pipeline_with_gemini(accessions):
|
|
| 216 |
country, spe_loc, ethnic, sample_type, col_date, iso, title, doi, pudID, features = meta["country"], meta["specific_location"], meta["ethnicity"], meta["sample_type"], meta["collection_date"], meta["isolate"], meta["title"], meta["doi"], meta["pubmed_id"], meta["all_features"]
|
| 217 |
acc_score["isolate"] = iso
|
| 218 |
print(meta)
|
|
|
|
| 219 |
# set up step: create the folder to save document
|
| 220 |
chunk, all_output = "",""
|
| 221 |
if pudID:
|
| 222 |
id = str(pudID)
|
| 223 |
saveTitle = title
|
| 224 |
else:
|
| 225 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 226 |
id = "DirectSubmission"
|
| 227 |
# folder_path = Path("/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id))
|
| 228 |
# if not folder_path.exists():
|
|
@@ -232,10 +238,13 @@ def pipeline_with_gemini(accessions):
|
|
| 232 |
# else:
|
| 233 |
# print("data/"+str(id) +" already exists.")
|
| 234 |
# saveLinkFolder = "/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id)
|
| 235 |
-
parent_folder_id = get_or_create_drive_folder(GDRIVE_PARENT_FOLDER_NAME)
|
| 236 |
-
data_folder_id = get_or_create_drive_folder(GDRIVE_DATA_FOLDER_NAME, parent_id=parent_folder_id)
|
|
|
|
|
|
|
| 237 |
sample_folder_id = get_or_create_drive_folder(str(id), parent_id=data_folder_id)
|
| 238 |
-
|
|
|
|
| 239 |
# Define document names
|
| 240 |
if len(saveTitle) > 50:
|
| 241 |
saveName = saveTitle[:50]
|
|
@@ -264,6 +273,14 @@ def pipeline_with_gemini(accessions):
|
|
| 264 |
print("✅ Files already exist in Google Drive. Downloading them...")
|
| 265 |
chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path)
|
| 266 |
all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 267 |
# Read and parse these into `chunk` and `all_output`
|
| 268 |
else:
|
| 269 |
# 🔥 Remove any stale local copies
|
|
@@ -321,7 +338,8 @@ def pipeline_with_gemini(accessions):
|
|
| 321 |
if len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower():
|
| 322 |
# might find the article
|
| 323 |
print("no article text")
|
| 324 |
-
tem_links = mtdna_classifier.search_google_custom(title, 2)
|
|
|
|
| 325 |
# get supplementary of that article
|
| 326 |
print("tem links length ", len(tem_links))
|
| 327 |
for link in tem_links:
|
|
@@ -436,8 +454,10 @@ def pipeline_with_gemini(accessions):
|
|
| 436 |
# data_preprocess.save_text_to_docx(all_output, all_filename, sample_folder_id)
|
| 437 |
|
| 438 |
# Upload to Drive
|
| 439 |
-
upload_file_to_drive(file_chunk_path, chunk_filename, sample_folder_id)
|
| 440 |
-
upload_file_to_drive(file_all_path, all_filename, sample_folder_id)
|
|
|
|
|
|
|
| 441 |
print("here 1")
|
| 442 |
|
| 443 |
# else:
|
|
@@ -528,9 +548,15 @@ def pipeline_with_gemini(accessions):
|
|
| 528 |
# country, sample_type, method_used, ethnic, spe_loc, total_query_cost = model.query_document_info(
|
| 529 |
# primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
|
| 530 |
# model.call_llm_api, chunk=chunk, all_output=all_output)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 531 |
country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost = model.query_document_info(
|
| 532 |
primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
|
| 533 |
model.call_llm_api, chunk=chunk, all_output=all_output)
|
|
|
|
|
|
|
| 534 |
if len(country) == 0: country = "unknown"
|
| 535 |
if len(sample_type) == 0: sample_type = "unknown"
|
| 536 |
if country_explanation: country_explanation = "-"+country_explanation
|
|
@@ -571,6 +597,54 @@ def pipeline_with_gemini(accessions):
|
|
| 571 |
else:
|
| 572 |
if len(method_used + sample_type_explanation)> 0:
|
| 573 |
acc_score["sample_type"][sample_type.lower()] = [method_used + sample_type_explanation]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 574 |
end = time.time()
|
| 575 |
total_cost_title += total_query_cost
|
| 576 |
acc_score["query_cost"] = f"{total_cost_title:.6f}"
|
|
|
|
| 6 |
import model
|
| 7 |
import mtdna_classifier
|
| 8 |
#import app
|
| 9 |
+
import smart_fallback
|
| 10 |
import pandas as pd
|
| 11 |
from pathlib import Path
|
| 12 |
import subprocess
|
|
|
|
| 28 |
import json
|
| 29 |
#––– Authentication setup –––
|
| 30 |
GDRIVE_PARENT_FOLDER_NAME = "mtDNA-Location-Classifier"
|
| 31 |
+
GDRIVE_DATA_FOLDER_NAME = os.environ["GDRIVE_DATA_FOLDER_NAME"]
|
| 32 |
GCP_CREDS_DICT = json.loads(os.environ["GCP_CREDS_JSON"]) # from HF secrets
|
| 33 |
GDRIVE_CREDS = Credentials.from_service_account_info(GCP_CREDS_DICT, scopes=["https://www.googleapis.com/auth/drive"])
|
| 34 |
drive_service = build("drive", "v3", credentials=GDRIVE_CREDS)
|
|
|
|
| 217 |
country, spe_loc, ethnic, sample_type, col_date, iso, title, doi, pudID, features = meta["country"], meta["specific_location"], meta["ethnicity"], meta["sample_type"], meta["collection_date"], meta["isolate"], meta["title"], meta["doi"], meta["pubmed_id"], meta["all_features"]
|
| 218 |
acc_score["isolate"] = iso
|
| 219 |
print(meta)
|
| 220 |
+
meta_expand = smart_fallback.fetch_ncbi(acc)
|
| 221 |
# set up step: create the folder to save document
|
| 222 |
chunk, all_output = "",""
|
| 223 |
if pudID:
|
| 224 |
id = str(pudID)
|
| 225 |
saveTitle = title
|
| 226 |
else:
|
| 227 |
+
try:
|
| 228 |
+
author_name = meta_expand["authors"].split(',')[0] # Use last name only
|
| 229 |
+
except:
|
| 230 |
+
author_name = meta_expand["authors"]
|
| 231 |
+
saveTitle = title + "_" + col_date + "_" + author_name
|
| 232 |
id = "DirectSubmission"
|
| 233 |
# folder_path = Path("/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id))
|
| 234 |
# if not folder_path.exists():
|
|
|
|
| 238 |
# else:
|
| 239 |
# print("data/"+str(id) +" already exists.")
|
| 240 |
# saveLinkFolder = "/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id)
|
| 241 |
+
# parent_folder_id = get_or_create_drive_folder(GDRIVE_PARENT_FOLDER_NAME)
|
| 242 |
+
# data_folder_id = get_or_create_drive_folder(GDRIVE_DATA_FOLDER_NAME, parent_id=parent_folder_id)
|
| 243 |
+
# sample_folder_id = get_or_create_drive_folder(str(id), parent_id=data_folder_id)
|
| 244 |
+
data_folder_id = GDRIVE_DATA_FOLDER_NAME # Use the shared folder directly
|
| 245 |
sample_folder_id = get_or_create_drive_folder(str(id), parent_id=data_folder_id)
|
| 246 |
+
print("sample folder id: ", sample_folder_id)
|
| 247 |
+
|
| 248 |
# Define document names
|
| 249 |
if len(saveTitle) > 50:
|
| 250 |
saveName = saveTitle[:50]
|
|
|
|
| 273 |
print("✅ Files already exist in Google Drive. Downloading them...")
|
| 274 |
chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path)
|
| 275 |
all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path)
|
| 276 |
+
print("chunk_id and all_id: ")
|
| 277 |
+
print(chunk_id, all_id)
|
| 278 |
+
file = drive_service.files().get(fileId="1LUJRTrq8yt4S4lLwCvTmlxaKqpr0nvEn", fields="id, name, parents, webViewLink").execute()
|
| 279 |
+
print("📄 Name:", file["name"])
|
| 280 |
+
print("📁 Parent folder ID:", file["parents"][0])
|
| 281 |
+
print("🔗 View link:", file["webViewLink"])
|
| 282 |
+
|
| 283 |
+
|
| 284 |
# Read and parse these into `chunk` and `all_output`
|
| 285 |
else:
|
| 286 |
# 🔥 Remove any stale local copies
|
|
|
|
| 338 |
if len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower():
|
| 339 |
# might find the article
|
| 340 |
print("no article text")
|
| 341 |
+
#tem_links = mtdna_classifier.search_google_custom(title, 2)
|
| 342 |
+
tem_links = smart_fallback.smart_google_search(meta_expand)
|
| 343 |
# get supplementary of that article
|
| 344 |
print("tem links length ", len(tem_links))
|
| 345 |
for link in tem_links:
|
|
|
|
| 454 |
# data_preprocess.save_text_to_docx(all_output, all_filename, sample_folder_id)
|
| 455 |
|
| 456 |
# Upload to Drive
|
| 457 |
+
result_chunk_upload = upload_file_to_drive(file_chunk_path, chunk_filename, sample_folder_id)
|
| 458 |
+
result_all_upload = upload_file_to_drive(file_all_path, all_filename, sample_folder_id)
|
| 459 |
+
print("UPLOAD RESULT FOR CHUNK: ", result_chunk_upload)
|
| 460 |
+
print(f"🔗 Uploaded file: https://drive.google.com/file/d/{result_chunk_upload}/view")
|
| 461 |
print("here 1")
|
| 462 |
|
| 463 |
# else:
|
|
|
|
| 548 |
# country, sample_type, method_used, ethnic, spe_loc, total_query_cost = model.query_document_info(
|
| 549 |
# primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
|
| 550 |
# model.call_llm_api, chunk=chunk, all_output=all_output)
|
| 551 |
+
print("this is chunk for the model")
|
| 552 |
+
print(chunk)
|
| 553 |
+
print("this is all output for the model")
|
| 554 |
+
print(all_output)
|
| 555 |
country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost = model.query_document_info(
|
| 556 |
primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
|
| 557 |
model.call_llm_api, chunk=chunk, all_output=all_output)
|
| 558 |
+
print("country using ai: ", country)
|
| 559 |
+
print("sample type using ai: ", sample_type)
|
| 560 |
if len(country) == 0: country = "unknown"
|
| 561 |
if len(sample_type) == 0: sample_type = "unknown"
|
| 562 |
if country_explanation: country_explanation = "-"+country_explanation
|
|
|
|
| 597 |
else:
|
| 598 |
if len(method_used + sample_type_explanation)> 0:
|
| 599 |
acc_score["sample_type"][sample_type.lower()] = [method_used + sample_type_explanation]
|
| 600 |
+
# last resort: combine all information to give all output otherwise unknown
|
| 601 |
+
if len(acc_score["country"]) == 0 or len(acc_score["sample_type"]) == 0:
|
| 602 |
+
text = ""
|
| 603 |
+
for key in meta_expand:
|
| 604 |
+
text += str(key) + ": " + meta_expand[key] + "\n"
|
| 605 |
+
if len(data_preprocess.normalize_for_overlap(all_output)) > 0:
|
| 606 |
+
text += data_preprocess.normalize_for_overlap(all_output)
|
| 607 |
+
if len(data_preprocess.normalize_for_overlap(chunk)) > 0:
|
| 608 |
+
text += data_preprocess.normalize_for_overlap(chunk)
|
| 609 |
+
text += ". NCBI Features: " + features
|
| 610 |
+
print("this is text for the last resort model")
|
| 611 |
+
print(text)
|
| 612 |
+
country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost = model.query_document_info(
|
| 613 |
+
primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
|
| 614 |
+
model.call_llm_api, chunk=text, all_output=text)
|
| 615 |
+
print("this is last resort results: ")
|
| 616 |
+
print("country: ", country)
|
| 617 |
+
print("sample type: ", sample_type)
|
| 618 |
+
if len(country) == 0: country = "unknown"
|
| 619 |
+
if len(sample_type) == 0: sample_type = "unknown"
|
| 620 |
+
if country_explanation: country_explanation = "-"+country_explanation
|
| 621 |
+
else: country_explanation = ""
|
| 622 |
+
if sample_type_explanation: sample_type_explanation = "-"+sample_type_explanation
|
| 623 |
+
else: sample_type_explanation = ""
|
| 624 |
+
if method_used == "unknown": method_used = ""
|
| 625 |
+
if country.lower() != "unknown":
|
| 626 |
+
stand_country = standardize_location.smart_country_lookup(country.lower())
|
| 627 |
+
if stand_country.lower() != "not found":
|
| 628 |
+
if stand_country.lower() in acc_score["country"]:
|
| 629 |
+
if country_explanation:
|
| 630 |
+
acc_score["country"][stand_country.lower()].append(method_used + country_explanation)
|
| 631 |
+
else:
|
| 632 |
+
acc_score["country"][stand_country.lower()] = [method_used + country_explanation]
|
| 633 |
+
else:
|
| 634 |
+
if country.lower() in acc_score["country"]:
|
| 635 |
+
if country_explanation:
|
| 636 |
+
if len(method_used + country_explanation) > 0:
|
| 637 |
+
acc_score["country"][country.lower()].append(method_used + country_explanation)
|
| 638 |
+
else:
|
| 639 |
+
if len(method_used + country_explanation) > 0:
|
| 640 |
+
acc_score["country"][country.lower()] = [method_used + country_explanation]
|
| 641 |
+
if sample_type.lower() != "unknown":
|
| 642 |
+
if sample_type.lower() in acc_score["sample_type"]:
|
| 643 |
+
if len(method_used + sample_type_explanation) > 0:
|
| 644 |
+
acc_score["sample_type"][sample_type.lower()].append(method_used + sample_type_explanation)
|
| 645 |
+
else:
|
| 646 |
+
if len(method_used + sample_type_explanation)> 0:
|
| 647 |
+
acc_score["sample_type"][sample_type.lower()] = [method_used + sample_type_explanation]
|
| 648 |
end = time.time()
|
| 649 |
total_cost_title += total_query_cost
|
| 650 |
acc_score["query_cost"] = f"{total_cost_title:.6f}"
|
smart_fallback.py
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from Bio import Entrez, Medline
|
| 2 |
+
import model
|
| 3 |
+
import mtdna_classifier
|
| 4 |
+
# Setup
|
| 5 |
+
def fetch_ncbi(accession_number):
    """Fetch descriptive GenBank metadata for one accession from NCBI.

    Requests the record from the ``nucleotide`` database through
    ``Entrez.efetch`` (GenBank record type, XML mode) and flattens the first
    returned record into a plain dict.

    Args:
        accession_number: Accession identifier; it is converted with
            ``str()`` before being sent to NCBI.

    Returns:
        dict: Always contains the keys ``authors``, ``institution``,
        ``isolate``, ``definition``, ``title``, ``seq_comment`` and
        ``collection_date``. A field that cannot be determined holds the
        string ``"unknown"``. If the request, the XML parsing, or the field
        extraction fails, every field is ``"unknown"``; such failures are
        reported with ``print`` and are not raised to the caller.
    """

    def _unknown_fields():
        # A fresh dict per call so callers can mutate the result safely.
        return {"authors": "unknown",
                "institution": "unknown",
                "isolate": "unknown",
                "definition": "unknown",
                "title": "unknown",
                "seq_comment": "unknown",
                "collection_date": "unknown"}

    # Stage 1: network request + XML parsing. Kept inside the try block so
    # that an HTTP/connection/parser failure falls back to the defaults
    # instead of propagating to the caller.
    try:
        Entrez.email = "[email protected]"  # Required by NCBI, REPLACE WITH YOUR EMAIL
        handle = Entrez.efetch(db="nucleotide", id=str(accession_number), rettype="gb", retmode="xml")
        try:
            record = Entrez.read(handle)
        finally:
            # Release the connection even when parsing fails.
            handle.close()
    except Exception as exc:  # boundary around third-party network I/O
        print(f"error in fetching ncbi data for {accession_number}: {exc}")
        return _unknown_fields()

    # Stage 2: validate the record structure (a non-empty list whose first
    # element is a dict).
    if not (isinstance(record, list) and len(record) > 0):
        print(f"Warning: No valid record or empty record list from NCBI for {accession_number}.")
        return _unknown_fields()
    gb_seq = record[0]
    if not isinstance(gb_seq, dict):
        print(f"Warning: record[0] is not a dictionary for {accession_number}. Type: {type(record[0])}")
        return _unknown_fields()

    # Stage 3: field extraction.
    outputs = _unknown_fields()
    try:
        # NOTE: "collection_date" is filled from the record's create date,
        # falling back to its update date.
        for date_key in ("GBSeq_create-date", "GBSeq_update-date"):
            if date_key in gb_seq:
                outputs["collection_date"] = gb_seq[date_key]
                break

        if "GBSeq_definition" in gb_seq:
            outputs["definition"] = gb_seq["GBSeq_definition"]

        # For each reference-derived field, the first reference that carries
        # it wins.
        for ref in gb_seq.get("GBSeq_references", []):
            if "GBReference_authors" in ref and outputs["authors"] == "unknown":
                # NOTE(review): the separator has no leading space
                # ("A,B.and C,D."); kept unchanged because callers derive
                # file names from this string.
                outputs["authors"] = "and ".join(ref["GBReference_authors"])
            if "GBReference_title" in ref and outputs["title"] == "unknown":
                outputs["title"] = ref["GBReference_title"]
            # "institution" is taken from the reference's journal field.
            if "GBReference_journal" in ref and outputs["institution"] == "unknown":
                outputs["institution"] = ref["GBReference_journal"]

        if "GBSeq_comment" in gb_seq:
            outputs["seq_comment"] = gb_seq["GBSeq_comment"]

        # The isolate is read from the qualifiers of the first feature only.
        features = gb_seq.get("GBSeq_feature-table") or []
        if features:
            for qualifier in features[0].get("GBFeature_quals", []):
                if qualifier.get("GBQualifier_name") == "isolate" and outputs["isolate"] == "unknown":
                    outputs["isolate"] = qualifier.get("GBQualifier_value", "unknown")
    except (KeyError, IndexError, TypeError, AttributeError) as exc:
        # Unexpected record layout: report it and fall back to the defaults.
        print(f"error in fetching ncbi data for {accession_number}: {exc}")
        return _unknown_fields()

    return outputs
|
| 77 |
+
# Method 1: Smarter Google
|
| 78 |
+
def smart_google_queries(metadata: dict):
    """Build Google search queries from NCBI-derived metadata.

    Reads the ``isolate``, ``authors``, ``institution`` and ``title`` entries
    of ``metadata`` and produces two queries for each of the first three
    fields, followed by the raw title.

    A field is skipped when it is missing, empty, or equal to the
    ``"unknown"`` placeholder (compared case-insensitively after stripping
    whitespace), so placeholder values never become search queries and the
    returned list never contains ``None``.

    Args:
        metadata: Mapping that may contain the keys listed above.

    Returns:
        list: Query strings in the order isolate, author, institution, title.
    """

    def _usable(value):
        # "unknown" is truthy, so a plain truthiness check is not enough.
        return bool(value) and str(value).strip().lower() != "unknown"

    def _first_segment(value):
        # Text before the first comma (for authors this is the first
        # author's last name when names are written "Last,Initials").
        return str(value).split(",")[0]

    # Extract useful fields
    isolate = metadata.get("isolate")
    author = metadata.get("authors")
    institution = metadata.get("institution")
    title = metadata.get("title")
    print(title)

    queries = []

    if _usable(isolate):
        queries.append(f'"{isolate}" mitochondrial DNA')
        queries.append(f'"{isolate}" site:ncbi.nlm.nih.gov')

    if _usable(author):
        author_name = _first_segment(author)  # Use last name only
        queries.append(f'"{author_name}" mitochondrial DNA')
        queries.append(f'"{author_name}" mtDNA site:researchgate.net')

    if _usable(institution):
        short_inst = _first_segment(institution)  # Take first part of institution
        queries.append(f'"{short_inst}" mtDNA sequence')
        queries.append(f'"{short_inst}" isolate site:nature.com')

    if _usable(title):
        queries.append(title)

    return queries
|
| 110 |
+
|
| 111 |
+
def filter_links_by_metadata(search_results):
    """Keep only links that look trustworthy or topically relevant.

    A link is kept when its text contains one of the trusted domain names
    (case-sensitive substring match) or, after lowercasing, one of the
    relevance keywords. Only the link string itself is inspected. Order of
    first appearance is preserved and duplicates are dropped.

    Args:
        search_results: Iterable of link strings; ``None`` or an empty
            iterable yields an empty list.

    Returns:
        list: The retained links.
    """
    trusted_domains = (
        "ncbi.nlm.nih.gov",
        "pubmed.ncbi.nlm.nih.gov",
        "pmc.ncbi.nlm.nih.gov",
        "biorxiv.org",
        "researchgate.net",
        "nature.com",
        "sciencedirect.com",
    )
    # Keywords are lowercased because they are matched against a lowercased
    # link; mixed-case entries such as "mtDNA" could otherwise never match.
    keywords = tuple(
        keyword.lower()
        for keyword in ("mtDNA", "mitochondrial", "accession", "isolate", "Homo sapiens", "sequence")
    )

    filtered = []
    seen = set()  # O(1) duplicate detection instead of scanning `filtered`
    for link in search_results or []:
        if link in seen:
            continue
        lowered = link.lower()
        is_trusted = any(domain in link for domain in trusted_domains)
        is_relevant = any(keyword in lowered for keyword in keywords)
        if is_trusted or is_relevant:
            seen.add(link)
            filtered.append(link)
    return filtered
|
| 142 |
+
|
| 143 |
+
def smart_google_search(metadata):
    """Run every metadata-derived query and return the filtered, unique links.

    Queries come from ``smart_google_queries(metadata)``; each one is sent to
    ``mtdna_classifier.search_google_custom`` with ``2`` as its second
    argument. Links are collected in first-seen order without duplicates and
    then passed through ``filter_links_by_metadata``.
    """
    unique_links = []
    for query in smart_google_queries(metadata):
        for url in mtdna_classifier.search_google_custom(query, 2):
            if url in unique_links:
                continue
            unique_links.append(url)
    return filter_links_by_metadata(unique_links)
|
| 155 |
+
# Method 2: Prompt LLM better or better ai search api with all
|
| 156 |
+
# the total information from even ncbi and all search
|