Spaces:

VyLala
/

mtDNALocation

Running

App Files Files Community

VyLala committed on Jul 22

Commit

0767cb0

verified ·

1 Parent(s): d79f1bd

Upload 3 files

Browse files

Files changed (3) hide show

mtdna_backend.py +32 -21
pipeline.py +82 -8
smart_fallback.py +156 -0

mtdna_backend.py CHANGED Viewed

@@ -139,12 +139,12 @@ def summarize_results(accession):
     if cached:
         print(f"✅ Using cached result for {accession}")
         return [[
-            cached["Sample ID"],
-            cached["Predicted Country"],
-            cached["Country Explanation"],
-            cached["Predicted Sample Type"],
-            cached["Sample Type Explanation"],
-            cached["Sources"],
             cached["Time cost"]
         ]]
     # only run when nothing in the cache
@@ -175,13 +175,15 @@ def summarize_results(accession):
         pred_country, pred_sample, country_explanation, sample_explanation = "unknown","unknown","unknown","unknown"
         for section, results in outputs[key].items():
           if section == "country" or section =="sample_type":
-            pred_output = "\n".join(list(results.keys()))
             output_explanation = ""
             for result, content in results.items():
               if len(result) == 0:  result = "unknown"
               if len(content) == 0: output_explanation = "unknown"
               else:
                 output_explanation += 'Method: ' + "\nMethod: ".join(content)  + "\n"
             if section == "country":
               pred_country, country_explanation = pred_output, output_explanation
             elif section == "sample_type":
@@ -191,24 +193,24 @@ def summarize_results(accession):
           else: label = key
         if len(outputs[key]["source"]) == 0:  outputs[key]["source"] = ["No Links"]
         row = {
-            "Sample ID": label,
-            "Predicted Country": pred_country,
-            "Country Explanation": country_explanation,
-            "Predicted Sample Type":pred_sample,
-            "Sample Type Explanation":sample_explanation,
-            "Sources": "\n".join(outputs[key]["source"]),
             "Time cost": outputs[key]["time_cost"]
         }
         #row_score.append(row)
         rows.append(list(row.values()))
         save_row = {
-            "Sample ID": label,
-            "Predicted Country": pred_country,
-            "Country Explanation": country_explanation,
-            "Predicted Sample Type":pred_sample,
-            "Sample Type Explanation":sample_explanation,
-            "Sources": "\n".join(outputs[key]["source"]),
             "Query_cost": outputs[key]["query_cost"],
             "Time cost": outputs[key]["time_cost"]
         }
@@ -530,14 +532,23 @@ def check_known_output(accession):
         if "Sample ID" not in df.columns:
             print("❌ Column 'Sample ID' not found in Google Sheet.")
             return None
         match = re.search(r"\b[A-Z]{2,4}\d{4,}", accession)
         if match:
             accession = match.group(0)
         matched = df[df["Sample ID"].str.contains(accession, case=False, na=False)]
         if not matched.empty:
-            return matched.iloc[0].to_dict()
         else:
             print(f"🔍 Accession {accession} not found in known_samples.")
             return None

     if cached:
         print(f"✅ Using cached result for {accession}")
         return [[
+            cached["Sample ID"] or "unknown",
+            cached["Predicted Country"] or "unknown",
+            cached["Country Explanation"] or "unknown",
+            cached["Predicted Sample Type"] or "unknown",
+            cached["Sample Type Explanation"] or "unknown",
+            cached["Sources"] or "No Links",
             cached["Time cost"]
         ]]
     # only run when nothing in the cache
         pred_country, pred_sample, country_explanation, sample_explanation = "unknown","unknown","unknown","unknown"
         for section, results in outputs[key].items():
           if section == "country" or section =="sample_type":
+            pred_output = []#"\n".join(list(results.keys()))
             output_explanation = ""
             for result, content in results.items():
               if len(result) == 0:  result = "unknown"
               if len(content) == 0: output_explanation = "unknown"
               else:
                 output_explanation += 'Method: ' + "\nMethod: ".join(content)  + "\n"
+              pred_output.append(result)
+            pred_output = "\n".join(pred_output)
             if section == "country":
               pred_country, country_explanation = pred_output, output_explanation
             elif section == "sample_type":
           else: label = key
         if len(outputs[key]["source"]) == 0:  outputs[key]["source"] = ["No Links"]
         row = {
+            "Sample ID": label or "unknown",
+            "Predicted Country": pred_country or "unknown",
+            "Country Explanation": country_explanation or "unknown",
+            "Predicted Sample Type":pred_sample or "unknown",
+            "Sample Type Explanation":sample_explanation or "unknown",
+            "Sources": "\n".join(outputs[key]["source"]) or "No Links",
             "Time cost": outputs[key]["time_cost"]
         }
         #row_score.append(row)
         rows.append(list(row.values()))
         save_row = {
+            "Sample ID": label or "unknown",
+            "Predicted Country": pred_country or "unknown",
+            "Country Explanation": country_explanation or "unknown",
+            "Predicted Sample Type":pred_sample or "unknown",
+            "Sample Type Explanation":sample_explanation or "unknown",
+            "Sources": "\n".join(outputs[key]["source"]) or "No Links",
             "Query_cost": outputs[key]["query_cost"],
             "Time cost": outputs[key]["time_cost"]
         }
         if "Sample ID" not in df.columns:
             print("❌ Column 'Sample ID' not found in Google Sheet.")
             return None
         match = re.search(r"\b[A-Z]{2,4}\d{4,}", accession)
         if match:
             accession = match.group(0)
         matched = df[df["Sample ID"].str.contains(accession, case=False, na=False)]
         if not matched.empty:
+            #return matched.iloc[0].to_dict()
+            row = matched.iloc[0]
+            country = row.get("Predicted Country", "").strip().lower()
+            sample_type = row.get("Predicted Sample Type", "").strip().lower()
+            if country and country != "unknown" and sample_type and sample_type != "unknown":
+                return row.to_dict()
+            else:
+                print(f"⚠️ Accession {accession} found but country/sample_type is unknown or empty.")
+                return None
         else:
             print(f"🔍 Accession {accession} not found in known_samples.")
             return None

pipeline.py CHANGED Viewed

@@ -6,6 +6,7 @@ import data_preprocess
 import model
 import mtdna_classifier
 #import app
 import pandas as pd
 from pathlib import Path
 import subprocess
@@ -27,7 +28,7 @@ import io
 import json
 #––– Authentication setup –––
 GDRIVE_PARENT_FOLDER_NAME = "mtDNA-Location-Classifier"
-GDRIVE_DATA_FOLDER_NAME = "data"
 GCP_CREDS_DICT = json.loads(os.environ["GCP_CREDS_JSON"])  # from HF secrets
 GDRIVE_CREDS = Credentials.from_service_account_info(GCP_CREDS_DICT, scopes=["https://www.googleapis.com/auth/drive"])
 drive_service = build("drive", "v3", credentials=GDRIVE_CREDS)
@@ -216,13 +217,18 @@ def pipeline_with_gemini(accessions):
       country, spe_loc, ethnic, sample_type, col_date, iso, title, doi, pudID, features = meta["country"], meta["specific_location"], meta["ethnicity"], meta["sample_type"], meta["collection_date"], meta["isolate"], meta["title"], meta["doi"], meta["pubmed_id"], meta["all_features"]
       acc_score["isolate"] = iso
       print(meta)
       # set up step: create the folder to save document
       chunk, all_output = "",""
       if pudID:
         id = str(pudID)
         saveTitle = title
       else:
-        saveTitle = title + "_" + col_date
         id = "DirectSubmission"
       # folder_path = Path("/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id))
       # if not folder_path.exists():
@@ -232,10 +238,13 @@ def pipeline_with_gemini(accessions):
       # else:
       #     print("data/"+str(id) +" already exists.")
       # saveLinkFolder = "/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id)
-      parent_folder_id = get_or_create_drive_folder(GDRIVE_PARENT_FOLDER_NAME)
-      data_folder_id = get_or_create_drive_folder(GDRIVE_DATA_FOLDER_NAME, parent_id=parent_folder_id)
       sample_folder_id = get_or_create_drive_folder(str(id), parent_id=data_folder_id)
       # Define document names
       if len(saveTitle) > 50:
           saveName = saveTitle[:50]
@@ -264,6 +273,14 @@ def pipeline_with_gemini(accessions):
         print("✅ Files already exist in Google Drive. Downloading them...")
         chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path)
         all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path)
         # Read and parse these into `chunk` and `all_output`
       else:
         # 🔥 Remove any stale local copies
@@ -321,7 +338,8 @@ def pipeline_with_gemini(accessions):
       if len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower():
         # might find the article
         print("no article text")
-        tem_links = mtdna_classifier.search_google_custom(title, 2)
         # get supplementary of that article
         print("tem links length ", len(tem_links))
         for link in tem_links:
@@ -436,8 +454,10 @@ def pipeline_with_gemini(accessions):
         # data_preprocess.save_text_to_docx(all_output, all_filename, sample_folder_id)
         # Upload to Drive
-        upload_file_to_drive(file_chunk_path, chunk_filename, sample_folder_id)
-        upload_file_to_drive(file_all_path, all_filename, sample_folder_id)
         print("here 1")
       # else:
@@ -528,9 +548,15 @@ def pipeline_with_gemini(accessions):
       # country, sample_type, method_used, ethnic, spe_loc, total_query_cost =  model.query_document_info(
       #     primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
       #     model.call_llm_api, chunk=chunk, all_output=all_output)
       country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost =  model.query_document_info(
           primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
           model.call_llm_api, chunk=chunk, all_output=all_output)
       if len(country) == 0: country = "unknown"
       if len(sample_type) == 0: sample_type = "unknown"
       if country_explanation: country_explanation = "-"+country_explanation
@@ -571,6 +597,54 @@ def pipeline_with_gemini(accessions):
         else:
           if len(method_used + sample_type_explanation)> 0:
             acc_score["sample_type"][sample_type.lower()] = [method_used + sample_type_explanation]
       end = time.time()
       total_cost_title += total_query_cost
       acc_score["query_cost"] = f"{total_cost_title:.6f}"

 import model
 import mtdna_classifier
 #import app
+import smart_fallback
 import pandas as pd
 from pathlib import Path
 import subprocess
 import json
 #––– Authentication setup –––
 GDRIVE_PARENT_FOLDER_NAME = "mtDNA-Location-Classifier"
+GDRIVE_DATA_FOLDER_NAME = os.environ["GDRIVE_DATA_FOLDER_NAME"]
 GCP_CREDS_DICT = json.loads(os.environ["GCP_CREDS_JSON"])  # from HF secrets
 GDRIVE_CREDS = Credentials.from_service_account_info(GCP_CREDS_DICT, scopes=["https://www.googleapis.com/auth/drive"])
 drive_service = build("drive", "v3", credentials=GDRIVE_CREDS)
       country, spe_loc, ethnic, sample_type, col_date, iso, title, doi, pudID, features = meta["country"], meta["specific_location"], meta["ethnicity"], meta["sample_type"], meta["collection_date"], meta["isolate"], meta["title"], meta["doi"], meta["pubmed_id"], meta["all_features"]
       acc_score["isolate"] = iso
       print(meta)
+      meta_expand = smart_fallback.fetch_ncbi(acc)
       # set up step: create the folder to save document
       chunk, all_output = "",""
       if pudID:
         id = str(pudID)
         saveTitle = title
       else:
+        try:
+          author_name = meta_expand["authors"].split(',')[0]  # Use last name only
+        except:
+          author_name = meta_expand["authors"]
+        saveTitle = title + "_" + col_date + "_" + author_name
         id = "DirectSubmission"
       # folder_path = Path("/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id))
       # if not folder_path.exists():
       # else:
       #     print("data/"+str(id) +" already exists.")
       # saveLinkFolder = "/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id)
+      # parent_folder_id = get_or_create_drive_folder(GDRIVE_PARENT_FOLDER_NAME)
+      # data_folder_id = get_or_create_drive_folder(GDRIVE_DATA_FOLDER_NAME, parent_id=parent_folder_id)
+      # sample_folder_id = get_or_create_drive_folder(str(id), parent_id=data_folder_id)
+      data_folder_id = GDRIVE_DATA_FOLDER_NAME  # Use the shared folder directly
       sample_folder_id = get_or_create_drive_folder(str(id), parent_id=data_folder_id)
+      print("sample folder id: ", sample_folder_id)
       # Define document names
       if len(saveTitle) > 50:
           saveName = saveTitle[:50]
         print("✅ Files already exist in Google Drive. Downloading them...")
         chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path)
         all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path)
+        print("chunk_id and all_id: ")
+        print(chunk_id, all_id)
+        file = drive_service.files().get(fileId="1LUJRTrq8yt4S4lLwCvTmlxaKqpr0nvEn", fields="id, name, parents, webViewLink").execute()
+        print("📄 Name:", file["name"])
+        print("📁 Parent folder ID:", file["parents"][0])
+        print("🔗 View link:", file["webViewLink"])
         # Read and parse these into `chunk` and `all_output`
       else:
         # 🔥 Remove any stale local copies
       if len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower():
         # might find the article
         print("no article text")
+        #tem_links = mtdna_classifier.search_google_custom(title, 2)
+        tem_links = smart_fallback.smart_google_search(meta_expand)
         # get supplementary of that article
         print("tem links length ", len(tem_links))
         for link in tem_links:
         # data_preprocess.save_text_to_docx(all_output, all_filename, sample_folder_id)
         # Upload to Drive
+        result_chunk_upload = upload_file_to_drive(file_chunk_path, chunk_filename, sample_folder_id)
+        result_all_upload = upload_file_to_drive(file_all_path, all_filename, sample_folder_id)
+        print("UPLOAD RESULT FOR CHUNK: ", result_chunk_upload)
+        print(f"🔗 Uploaded file: https://drive.google.com/file/d/{result_chunk_upload}/view")
         print("here 1")
       # else:
       # country, sample_type, method_used, ethnic, spe_loc, total_query_cost =  model.query_document_info(
       #     primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
       #     model.call_llm_api, chunk=chunk, all_output=all_output)
+      print("this is chunk for the model")
+      print(chunk)
+      print("this is all output for the model")
+      print(all_output)
       country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost =  model.query_document_info(
           primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
           model.call_llm_api, chunk=chunk, all_output=all_output)
+      print("country using ai: ", country)
+      print("sample type using ai: ", sample_type)
       if len(country) == 0: country = "unknown"
       if len(sample_type) == 0: sample_type = "unknown"
       if country_explanation: country_explanation = "-"+country_explanation
         else:
           if len(method_used + sample_type_explanation)> 0:
             acc_score["sample_type"][sample_type.lower()] = [method_used + sample_type_explanation]
+      # last resort: combine all information to give all output otherwise unknown
+      if len(acc_score["country"]) == 0 or len(acc_score["sample_type"]) == 0:
+        text = ""
+        for key in meta_expand:
+          text += str(key) + ": " + meta_expand[key] + "\n"
+        if len(data_preprocess.normalize_for_overlap(all_output)) > 0:
+          text += data_preprocess.normalize_for_overlap(all_output)
+        if len(data_preprocess.normalize_for_overlap(chunk)) > 0:
+          text += data_preprocess.normalize_for_overlap(chunk)
+        text += ". NCBI Features: " + features
+        print("this is text for the last resort model")
+        print(text)
+        country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost =  model.query_document_info(
+            primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
+            model.call_llm_api, chunk=text, all_output=text)
+        print("this is last resort results: ")
+        print("country: ", country)
+        print("sample type: ", sample_type)
+        if len(country) == 0: country = "unknown"
+        if len(sample_type) == 0: sample_type = "unknown"
+        if country_explanation: country_explanation = "-"+country_explanation
+        else: country_explanation = ""
+        if sample_type_explanation: sample_type_explanation = "-"+sample_type_explanation
+        else: sample_type_explanation = ""
+        if method_used == "unknown": method_used = ""
+        if country.lower() != "unknown":
+          stand_country = standardize_location.smart_country_lookup(country.lower())
+          if stand_country.lower() != "not found":
+            if stand_country.lower() in acc_score["country"]:
+              if country_explanation:
+                acc_score["country"][stand_country.lower()].append(method_used + country_explanation)
+            else:
+              acc_score["country"][stand_country.lower()] = [method_used + country_explanation]
+          else:
+            if country.lower() in acc_score["country"]:
+              if country_explanation:
+                if len(method_used + country_explanation) > 0:
+                  acc_score["country"][country.lower()].append(method_used + country_explanation)
+            else:
+              if len(method_used + country_explanation) > 0:
+                acc_score["country"][country.lower()] = [method_used + country_explanation]
+        if sample_type.lower() != "unknown":
+            if sample_type.lower() in acc_score["sample_type"]:
+              if len(method_used + sample_type_explanation) > 0:
+                acc_score["sample_type"][sample_type.lower()].append(method_used + sample_type_explanation)
+            else:
+              if len(method_used + sample_type_explanation)> 0:
+                acc_score["sample_type"][sample_type.lower()] = [method_used + sample_type_explanation]
       end = time.time()
       total_cost_title += total_query_cost
       acc_score["query_cost"] = f"{total_cost_title:.6f}"

smart_fallback.py ADDED Viewed

	@@ -0,0 +1,156 @@

+from Bio import Entrez, Medline
+import model
+import mtdna_classifier
+# Setup
def _default_ncbi_metadata():
    """Return a fresh metadata dict with every field set to "unknown"."""
    return {
        "authors": "unknown",
        "institution": "unknown",
        "isolate": "unknown",
        "definition": "unknown",
        "title": "unknown",
        "seq_comment": "unknown",
        "collection_date": "unknown",
    }


def _extract_ncbi_metadata(gb_seq):
    """Copy the fields of interest out of one parsed GBSeq record (a dict)."""
    outputs = _default_ncbi_metadata()

    # NOTE: "collection_date" is filled from the record's creation date,
    # falling back to its last-update date; no sampling date is read here.
    for date_key in ("GBSeq_create-date", "GBSeq_update-date"):
        if date_key in gb_seq:
            outputs["collection_date"] = gb_seq[date_key]
            break

    if "GBSeq_definition" in gb_seq:
        outputs["definition"] = gb_seq["GBSeq_definition"]

    # For each reference-derived field, keep the value from the first
    # reference that provides it.
    for ref in gb_seq.get("GBSeq_references") or []:
        if "GBReference_authors" in ref and outputs["authors"] == "unknown":
            outputs["authors"] = " and ".join(ref["GBReference_authors"])
        if "GBReference_title" in ref and outputs["title"] == "unknown":
            outputs["title"] = ref["GBReference_title"]
        # The reference's journal line is what gets stored as "institution".
        if "GBReference_journal" in ref and outputs["institution"] == "unknown":
            outputs["institution"] = ref["GBReference_journal"]

    if "GBSeq_comment" in gb_seq:
        outputs["seq_comment"] = gb_seq["GBSeq_comment"]

    # Only the first entry of the feature table is inspected for an
    # "isolate" qualifier; an empty table simply leaves the default.
    features = gb_seq.get("GBSeq_feature-table") or []
    if features:
        for qual in features[0].get("GBFeature_quals") or []:
            if qual.get("GBQualifier_name") == "isolate" and outputs["isolate"] == "unknown":
                outputs["isolate"] = qual.get("GBQualifier_value", "unknown")

    return outputs


def fetch_ncbi(accession_number):
    """Fetch a nucleotide record from NCBI and return a flat metadata dict.

    Args:
        accession_number: Accession to look up; converted with ``str()``
            before being sent to ``Entrez.efetch``.

    Returns:
        A dict with the keys ``authors``, ``institution``, ``isolate``,
        ``definition``, ``title``, ``seq_comment`` and ``collection_date``.
        Any field that could not be read is the string ``"unknown"``.
        The function is best-effort: when the fetch or the parsing fails,
        or the response has an unexpected shape, every field is
        ``"unknown"`` and no exception is raised.
    """
    try:
        # NOTE(review): the address below looks like a redacted placeholder;
        # NCBI expects a real contact e-mail here.
        Entrez.email = "[email protected]"  # Required by NCBI, REPLACE WITH YOUR EMAIL
        handle = Entrez.efetch(
            db="nucleotide", id=str(accession_number), rettype="gb", retmode="xml"
        )
        try:
            record = Entrez.read(handle)
        finally:
            # Close the handle even when parsing the response fails.
            handle.close()

        # The parsed response is expected to be a non-empty list whose
        # first element is a dict describing the sequence.
        if not (isinstance(record, list) and len(record) > 0):
            print(f"Warning: No valid record or empty record list from NCBI for {accession_number}.")
            return _default_ncbi_metadata()
        if not isinstance(record[0], dict):
            print(f"Warning: record[0] is not a dictionary for {accession_number}. Type: {type(record[0])}")
            return _default_ncbi_metadata()

        return _extract_ncbi_metadata(record[0])
    except Exception as exc:
        # Deliberate best-effort boundary: report the failure and fall back
        # to defaults instead of aborting the caller.
        print(f"error in fetching ncbi data: {exc}")
        return _default_ncbi_metadata()
+# Method 1: Smarter Google
def smart_google_queries(metadata: dict):
    """Build a list of Google search queries from record metadata.

    Args:
        metadata: Mapping that may contain the keys ``isolate``,
            ``authors``, ``institution`` and ``title``.

    Returns:
        A list of query strings, in this order: two isolate queries, two
        author queries, two institution queries, then the title itself.
        A field contributes nothing when it is missing, ``None``, blank,
        or the placeholder ``"unknown"`` (the value ``fetch_ncbi`` uses
        for fields it could not read), so the list never contains ``None``
        or searches for the literal word "unknown".
    """
    def usable(value):
        # Normalise a metadata value to a non-empty string, or None when
        # it carries no searchable information.
        if value is None:
            return None
        text = str(value).strip()
        if not text or text.lower() == "unknown":
            return None
        return text

    isolate = usable(metadata.get("isolate"))
    author = usable(metadata.get("authors"))
    institution = usable(metadata.get("institution"))
    title = usable(metadata.get("title"))

    queries = []
    if isolate:
        queries.append(f'"{isolate}" mitochondrial DNA')
        queries.append(f'"{isolate}" site:ncbi.nlm.nih.gov')
    if author:
        # Text before the first comma; intended to be the first author's
        # last name.
        author_name = author.split(',')[0]
        queries.append(f'"{author_name}" mitochondrial DNA')
        queries.append(f'"{author_name}" mtDNA site:researchgate.net')
    if institution:
        # Only the first comma-separated part of the institution is used.
        short_inst = institution.split(',')[0]
        queries.append(f'"{short_inst}" mtDNA sequence')
        queries.append(f'"{short_inst}" isolate site:nature.com')
    if title:
        queries.append(title)
    return queries
def filter_links_by_metadata(search_results):
    """Keep only the links that look trustworthy or relevant.

    A link is kept when it contains one of the trusted domain names, or
    when it contains one of the relevance keywords. Matching is a
    case-insensitive substring test on the link text itself.

    Args:
        search_results: Iterable of link strings; ``None`` or an empty
            collection yields an empty list. Non-string entries are
            ignored.

    Returns:
        The kept links, de-duplicated, in their original order.
    """
    trusted_domains = (
        "ncbi.nlm.nih.gov",
        "pubmed.ncbi.nlm.nih.gov",
        "pmc.ncbi.nlm.nih.gov",
        "biorxiv.org",
        "researchgate.net",
        "nature.com",
        "sciencedirect.com",
    )
    # Keywords are stored lower-cased because they are compared against a
    # lower-cased link; mixed-case entries such as "mtDNA" could otherwise
    # never match.
    keywords = ("mtdna", "mitochondrial", "accession", "isolate", "homo sapiens", "sequence")

    filtered = []
    seen = set()  # O(1) duplicate check while `filtered` preserves order
    for link in search_results or []:
        if not isinstance(link, str) or link in seen:
            continue
        lowered = link.lower()
        is_trusted = any(domain in lowered for domain in trusted_domains)
        is_relevant = any(keyword in lowered for keyword in keywords)
        if is_trusted or is_relevant:
            seen.add(link)
            filtered.append(link)
    return filtered
def smart_google_search(metadata):
    """Run every metadata-derived query and return the filtered links.

    Queries come from ``smart_google_queries(metadata)``; each one is sent
    to ``mtdna_classifier.search_google_custom`` asking for 2 results. The
    collected links are de-duplicated in first-seen order and then passed
    through ``filter_links_by_metadata``.

    Args:
        metadata: Metadata mapping forwarded to ``smart_google_queries``.

    Returns:
        The list produced by ``filter_links_by_metadata``.
    """
    links = []
    seen = set()
    for query in smart_google_queries(metadata):
        # The query builder can yield an empty or None entry (e.g. a
        # missing title); never send such a query to the search API.
        if not query:
            continue
        # NOTE(review): search_google_custom is assumed to return an
        # iterable of link strings; a None result is treated as "no links".
        results = mtdna_classifier.search_google_custom(query, 2)
        for link in results or []:
            if link not in seen:
                seen.add(link)
                links.append(link)
    return filter_links_by_metadata(links)
+# Method 2 (not implemented yet): prompt the LLM better, or use a better AI
+# search API, giving it all the information gathered from NCBI and from every search.