Update pipeline.py

pipeline.py (+142 -22) CHANGED
@@ -304,8 +304,6 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
     acc_score = { "isolate": "",
                   "country":{},
                   "sample_type":{},
-                  #"specific_location":{},
-                  #"ethnicity":{},
                   "query_cost":total_cost_title,
                   "time_cost":None,
                   "source":links,
@@ -395,10 +393,6 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
     if stand_country.lower() != "not found":
         acc_score["country"][stand_country.lower()] = ["ncbi"]
     else: acc_score["country"][country.lower()] = ["ncbi"]
-    # if spe_loc.lower() != "unknown":
-    #     acc_score["specific_location"][spe_loc.lower()] = ["ncbi"]
-    # if ethnic.lower() != "unknown":
-    #     acc_score["ethnicity"][ethnic.lower()] = ["ncbi"]
    if sample_type.lower() != "unknown":
         acc_score["sample_type"][sample_type.lower()] = ["ncbi"]
     # second way: LLM model
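For orientation, these two hunks show the shape `acc_score` takes after the NCBI pass: each classification field maps a candidate value to the list of sources that support it. A purely illustrative record (the values below are made up, not from the commit) might look like:

```python
# Illustrative shape of acc_score after the NCBI pass (values are invented):
acc_score = {
    "isolate": "",
    "country": {"vietnam": ["ncbi"]},       # candidate value -> evidence sources
    "sample_type": {"modern": ["ncbi"]},
    "query_cost": 0.0021,                   # accumulated LLM query cost so far
    "time_cost": None,                      # filled in later in the pipeline
    "source": ["https://example.org/..."],  # links consulted for this accession
}
```

The commit drops the commented-out `specific_location` and `ethnicity` fields from both the dict literal and the NCBI scoring pass.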
@@ -446,7 +440,44 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
     print(f"✅ CrossRef metadata fetched for {link}")
     other_explain = "Because full-text is restricted by the publisher, our system uses abstracts and metadata to remain compliant while still supporting exploratory analysis, search, and literature linking."
     article_text = html.mergeTextInJson(metadata_text)
-
+    # also try searching pubmed with the title and extract abstract and add to article text
+    # Step 1: Search for the paper
+    print("search the paper's abstract on pubmed")
+    handle = Entrez.esearch(db="pubmed", term=title, retmax=1)
+    record = Entrez.read(handle)
+    id_list = record.get("IdList", [])
+
+    if not id_list:
+        print("No PubMed results found.")
+    else:
+        pubmed_id = id_list[0]
+        fetch_handle = Entrez.efetch(db="pubmed", id=pubmed_id, rettype="xml", retmode="xml")
+        fetch_record = Entrez.read(fetch_handle)
+
+        # Safe extraction
+        article = fetch_record.get("PubmedArticle", [])
+        if not article:
+            print("No PubmedArticle entry returned.")
+        else:
+            article = article[0]  # the real payload
+            try:
+                abstract_sections = (
+                    article["MedlineCitation"]["Article"]
+                    .get("Abstract", {})
+                    .get("AbstractText", [])
+                )
+                full_abstract = " ".join(str(s) for s in abstract_sections)
+
+                if full_abstract.strip():
+                    print("Abstract found (len={}):".format(len(full_abstract)))
+                    #print(full_abstract)
+                    article_text += full_abstract
+                else:
+                    print("This article has **no abstract available on PubMed**.")
+
+            except KeyError:
+                print("Abstract field missing in this PubMed record.")
+
     if article_text:
         if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text.lower() or "403 Forbidden Request".lower() not in article_text.lower():
             out_links[link] = article_text
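This same Entrez search-and-fetch block is inserted three times in the file (here and in the two later hunks). A minimal sketch of how it could be factored into a single helper; the name `fetch_pubmed_abstract` is mine, not the repo's, and the sketch additionally sets `Entrez.email` (which Biopython expects for NCBI requests) and closes the handles the diff leaves open:

```python
from Bio import Entrez

Entrez.email = "you@example.org"  # NCBI asks Biopython clients to identify themselves

def fetch_pubmed_abstract(title: str) -> str:
    """Best-effort lookup of a paper's abstract on PubMed by title.

    Returns the abstract text, or "" if the search or extraction fails.
    Hypothetical helper consolidating the block repeated in pipeline.py.
    """
    handle = Entrez.esearch(db="pubmed", term=title, retmax=1)
    record = Entrez.read(handle)
    handle.close()
    id_list = record.get("IdList", [])
    if not id_list:
        return ""

    fetch_handle = Entrez.efetch(db="pubmed", id=id_list[0], rettype="xml", retmode="xml")
    fetch_record = Entrez.read(fetch_handle)
    fetch_handle.close()

    articles = fetch_record.get("PubmedArticle", [])
    if not articles:
        return ""
    try:
        sections = (
            articles[0]["MedlineCitation"]["Article"]
            .get("Abstract", {})
            .get("AbstractText", [])
        )
    except KeyError:
        return ""
    return " ".join(str(s) for s in sections)
```

Each of the three insertion points would then reduce to `article_text += fetch_pubmed_abstract(title)`.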
@@ -460,14 +491,10 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
     if doi=="unknown" or len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower():
         # might find the article
         print("no article text, start tem link")
-        #tem_links = mtdna_classifier.search_google_custom(title, 2)
         tem_links = smart_fallback.smart_google_search(meta_expand)
         print("tem links: ", tem_links)
-        tem_link_acc = smart_fallback.google_accession_search(acc)
-        tem_links += tem_link_acc
         tem_links = unique_preserve_order(tem_links)
         print("tem link before filtering: ", tem_links)
-        # filter the quality link
         print("saveLinkFolder as sample folder id: ", sample_folder_id)
         print("start the smart filter link")
         if stop_flag is not None and stop_flag.value:
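With the per-accession Google search (`google_accession_search`) dropped here and in the three later fallback sites, `unique_preserve_order` now only dedupes the results of `smart_google_search` while keeping their ranking. The repo's implementation isn't shown in this diff; a plausible one is:

```python
def unique_preserve_order(items):
    # dict keys preserve insertion order in Python 3.7+, so this drops
    # duplicates while keeping each item's first-occurrence position
    return list(dict.fromkeys(items))
```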
@@ -548,6 +575,43 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
     print(f"✅ CrossRef metadata fetched for {link}")
     other_explain = "Because full-text is restricted by the publisher, our system uses abstracts and metadata to remain compliant while still supporting exploratory analysis, search, and literature linking."
     article_text = html.mergeTextInJson(metadata_text)
+    # Step 1: Search for the paper
+    print("search the paper's abstract on pubmed")
+    handle = Entrez.esearch(db="pubmed", term=title, retmax=1)
+    record = Entrez.read(handle)
+    id_list = record.get("IdList", [])
+
+    if not id_list:
+        print("No PubMed results found.")
+    else:
+        pubmed_id = id_list[0]
+        fetch_handle = Entrez.efetch(db="pubmed", id=pubmed_id, rettype="xml", retmode="xml")
+        fetch_record = Entrez.read(fetch_handle)
+
+        # Safe extraction
+        article = fetch_record.get("PubmedArticle", [])
+        if not article:
+            print("No PubmedArticle entry returned.")
+        else:
+            article = article[0]  # the real payload
+            try:
+                abstract_sections = (
+                    article["MedlineCitation"]["Article"]
+                    .get("Abstract", {})
+                    .get("AbstractText", [])
+                )
+                full_abstract = " ".join(str(s) for s in abstract_sections)
+
+                if full_abstract.strip():
+                    print("Abstract found (len={}):".format(len(full_abstract)))
+                    #print(full_abstract)
+                    article_text += full_abstract
+                else:
+                    print("This article has **no abstract available on PubMed**.")
+
+            except KeyError:
+                print("Abstract field missing in this PubMed record.")
+
     if article_text:
         if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text.lower() or "403 Forbidden Request".lower() not in article_text.lower():
             out_links[link] = article_text
@@ -564,8 +628,6 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
         #tem_links = mtdna_classifier.search_google_custom(title, 2)
         tem_links = smart_fallback.smart_google_search(meta_expand)
         print("tem links: ", tem_links)
-        tem_link_acc = smart_fallback.google_accession_search(acc)
-        tem_links += tem_link_acc
         tem_links = unique_preserve_order(tem_links)
         print("tem link before filtering: ", tem_links)
         # filter the quality link
@@ -607,6 +669,42 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
     print(f"✅ CrossRef metadata fetched for {link}")
     other_explain = "Because full-text is restricted by the publisher, our system uses abstracts and metadata to remain compliant while still supporting exploratory analysis, search, and literature linking."
     article_text = html.mergeTextInJson(metadata_text)
+    # Step 1: Search for the paper
+    print("search the paper's abstract on pubmed")
+    handle = Entrez.esearch(db="pubmed", term=title, retmax=1)
+    record = Entrez.read(handle)
+    id_list = record.get("IdList", [])
+
+    if not id_list:
+        print("No PubMed results found.")
+    else:
+        pubmed_id = id_list[0]
+        fetch_handle = Entrez.efetch(db="pubmed", id=pubmed_id, rettype="xml", retmode="xml")
+        fetch_record = Entrez.read(fetch_handle)
+
+        # Safe extraction
+        article = fetch_record.get("PubmedArticle", [])
+        if not article:
+            print("No PubmedArticle entry returned.")
+        else:
+            article = article[0]  # the real payload
+            try:
+                abstract_sections = (
+                    article["MedlineCitation"]["Article"]
+                    .get("Abstract", {})
+                    .get("AbstractText", [])
+                )
+                full_abstract = " ".join(str(s) for s in abstract_sections)
+
+                if full_abstract.strip():
+                    print("Abstract found (len={}):".format(len(full_abstract)))
+                    #print(full_abstract)
+                    article_text += full_abstract
+                else:
+                    print("This article has **no abstract available on PubMed**.")
+
+            except KeyError:
+                print("Abstract field missing in this PubMed record.")
     if article_text:
         if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text.lower() or "403 Forbidden Request".lower() not in article_text.lower():
             out_links[link] = article_text
@@ -623,8 +721,6 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
         #tem_links = mtdna_classifier.search_google_custom(title, 2)
         tem_links = smart_fallback.smart_google_search(meta_expand)
         print("tem links: ", tem_links)
-        tem_link_acc = smart_fallback.google_accession_search(acc)
-        tem_links += tem_link_acc
         tem_links = unique_preserve_order(tem_links)
         print("tem link before filtering: ", tem_links)
         # filter the quality link
@@ -682,14 +778,33 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
     if not all_output: all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
     if len(all_output) > 1*1000*1000:
         all_output = data_preprocess.normalize_for_overlap(all_output)
-        if len(all_output) > 1*1000*1000:
-            all_output = all_output[:1000000]
     if len(chunk) > 1*1000*1000:
         chunk = data_preprocess.normalize_for_overlap(chunk)
-        if len(chunk) > 1*1000*1000:
-            chunk = chunk[:1*1000*1000]
     print("chunk len: ", len(chunk))
-    print("all output len: ", len(all_output))
+    print("all output len: ", len(all_output))
+    # use build context for llm function to reduce token
+    reduce_context_for_llm = ""
+    if len(all_output)>900000 or len(chunk)>900000:
+        texts_reduce = []
+        out_links_reduce = {}
+        if links:
+            for link in links:
+                all_output_reduce, chunk_reduce, context_reduce = "", "",""
+                context_reduce, all_output_reduce, chunk_reduce = await process_link_chunk_allOutput(link,
+                                                        iso, acc, sample_folder_id, out_links_reduce,
+                                                        all_output_reduce, chunk_reduce)
+                texts_reduce.append(all_output_reduce)
+                out_links_reduce[link] = {"all_output": all_output_reduce}
+        input_prompt = ["country_name", "modern/ancient/unknown"]
+        if niche_cases: input_prompt += niche_cases
+        reduce_context_for_llm = data_preprocess.build_context_for_llm(texts_reduce, acc, input_prompt)
+        if reduce_context_for_llm:
+            print("reduce context for llm")
+            all_output = reduce_context_for_llm
+        else:
+            print("reduce context no succeed")
+            all_output = all_output[:900000]
+
     data_preprocess.save_text_to_docx(chunk, file_chunk_path)
     data_preprocess.save_text_to_docx(all_output, file_all_path)
     # Later when saving new files
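This hunk replaces blind truncation at 1 MB with a reduce-then-fallback pattern: when the text exceeds a 900,000-character budget, rebuild a focused context from the per-link texts, and only hard-truncate if that fails. Schematically (a hypothetical wrapper; `build_context` stands in for the repo's `data_preprocess.build_context_for_llm`, whose internals are not shown in this commit):

```python
LIMIT = 900_000  # character budget the commit uses for the LLM context

def shrink_context(all_output, texts, acc, prompts, build_context):
    """Reduce-then-fallback sketch of the diff's logic (hypothetical helper).

    build_context is expected to behave like data_preprocess.build_context_for_llm:
    take per-link texts, an accession, and the query prompts, and return a
    focused context string (falsy on failure).
    """
    if len(all_output) <= LIMIT:
        return all_output          # already within budget, leave untouched
    reduced = build_context(texts, acc, prompts)
    if reduced:
        return reduced             # focused context built from per-link texts
    return all_output[:LIMIT]      # last resort: hard truncation
```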
@@ -723,13 +838,18 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
     print("this is text for the last resort model")
     print(text)
 
-    predicted_outputs, method_used, total_query_cost = model.query_document_info(
+    predicted_outputs, method_used, total_query_cost, more_links = await model.query_document_info(
         niche_cases=niche_cases,
         query_word=primary_word, alternative_query_word=alternative_word,
+        saveLinkFolder = sample_folder_id,
         metadata=meta,
         master_structured_lookup=None, faiss_index=None, document_chunks=None,
         llm_api_function=model.call_llm_api, chunk=text, all_output=text)
-    print("
+    print("add more links from model.query document")
+    if more_links:
+        links += more_links
+        acc_score["source"] = links
+    print("this is llm results: ")
     for pred_out in predicted_outputs:
         # only for country, we have to standardize
         if pred_out == "country_name":
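`model.query_document_info` now returns a fourth value, `more_links`, and the call is awaited, so the function has evidently become a coroutine. One detail worth flagging: `links += more_links` can reintroduce duplicates that the earlier `unique_preserve_order` pass removed. A hedged variant reusing the file's own helper:

```python
# hypothetical tweak: keep the source list deduplicated after merging
if more_links:
    links = unique_preserve_order(links + more_links)
    acc_score["source"] = links
```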