Update pipeline.py

pipeline.py (+142 -22) CHANGED
@@ -304,8 +304,6 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
     acc_score = { "isolate": "",
                   "country":{},
                   "sample_type":{},
-                  #"specific_location":{},
-                  #"ethnicity":{},
                   "query_cost":total_cost_title,
                   "time_cost":None,
                   "source":links,
@@ -395,10 +393,6 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
     if stand_country.lower() != "not found":
         acc_score["country"][stand_country.lower()] = ["ncbi"]
     else: acc_score["country"][country.lower()] = ["ncbi"]
-    # if spe_loc.lower() != "unknown":
-    #     acc_score["specific_location"][spe_loc.lower()] = ["ncbi"]
-    # if ethnic.lower() != "unknown":
-    #     acc_score["ethnicity"][ethnic.lower()] = ["ncbi"]
    if sample_type.lower() != "unknown":
         acc_score["sample_type"][sample_type.lower()] = ["ncbi"]
     # second way: LLM model
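For orientation, these two hunks show the shape `acc_score` takes after the NCBI pass: each classification field maps a candidate value to the list of sources that support it. A purely illustrative record (the values below are made up, not from the commit) might look like:

```python
# Illustrative shape of acc_score after the NCBI pass (values are invented):
acc_score = {
    "isolate": "",
    "country": {"vietnam": ["ncbi"]},       # candidate value -> evidence sources
    "sample_type": {"modern": ["ncbi"]},
    "query_cost": 0.0021,                   # accumulated LLM query cost so far
    "time_cost": None,                      # filled in later in the pipeline
    "source": ["https://example.org/..."],  # links consulted for this accession
}
```

The commit drops the commented-out `specific_location` and `ethnicity` fields from both the dict literal and the NCBI scoring pass.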
@@ -446,7 +440,44 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
     print(f"✅ CrossRef metadata fetched for {link}")
     other_explain = "Because full-text is restricted by the publisher, our system uses abstracts and metadata to remain compliant while still supporting exploratory analysis, search, and literature linking."
     article_text = html.mergeTextInJson(metadata_text)
-
+    # also try searching pubmed with the title and extract abstract and add to article text
+    # Step 1: Search for the paper
+    print("search the paper's abstract on pubmed")
+    handle = Entrez.esearch(db="pubmed", term=title, retmax=1)
+    record = Entrez.read(handle)
+    id_list = record.get("IdList", [])
+
+    if not id_list:
+        print("No PubMed results found.")
+    else:
+        pubmed_id = id_list[0]
+        fetch_handle = Entrez.efetch(db="pubmed", id=pubmed_id, rettype="xml", retmode="xml")
+        fetch_record = Entrez.read(fetch_handle)
+
+        # Safe extraction
+        article = fetch_record.get("PubmedArticle", [])
+        if not article:
+            print("No PubmedArticle entry returned.")
+        else:
+            article = article[0]  # the real payload
+            try:
+                abstract_sections = (
+                    article["MedlineCitation"]["Article"]
+                    .get("Abstract", {})
+                    .get("AbstractText", [])
+                )
+                full_abstract = " ".join(str(s) for s in abstract_sections)
+
+                if full_abstract.strip():
+                    print("Abstract found (len={}):".format(len(full_abstract)))
+                    #print(full_abstract)
+                    article_text += full_abstract
+                else:
+                    print("This article has **no abstract available on PubMed**.")
+
+            except KeyError:
+                print("Abstract field missing in this PubMed record.")
+
     if article_text:
         if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text.lower() or "403 Forbidden Request".lower() not in article_text.lower():
             out_links[link] = article_text
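This same Entrez search-and-fetch block is inserted three times in the file (here and in the two later hunks). A minimal sketch of how it could be factored into a single helper; the name `fetch_pubmed_abstract` is mine, not the repo's, and the sketch additionally sets `Entrez.email` (which Biopython expects for NCBI requests) and closes the handles the diff leaves open:

```python
from Bio import Entrez

Entrez.email = "you@example.org"  # NCBI asks Biopython clients to identify themselves

def fetch_pubmed_abstract(title: str) -> str:
    """Best-effort lookup of a paper's abstract on PubMed by title.

    Returns the abstract text, or "" if the search or extraction fails.
    Hypothetical helper consolidating the block repeated in pipeline.py.
    """
    handle = Entrez.esearch(db="pubmed", term=title, retmax=1)
    record = Entrez.read(handle)
    handle.close()
    id_list = record.get("IdList", [])
    if not id_list:
        return ""

    fetch_handle = Entrez.efetch(db="pubmed", id=id_list[0], rettype="xml", retmode="xml")
    fetch_record = Entrez.read(fetch_handle)
    fetch_handle.close()

    articles = fetch_record.get("PubmedArticle", [])
    if not articles:
        return ""
    try:
        sections = (
            articles[0]["MedlineCitation"]["Article"]
            .get("Abstract", {})
            .get("AbstractText", [])
        )
    except KeyError:
        return ""
    return " ".join(str(s) for s in sections)
```

Each of the three insertion points would then reduce to `article_text += fetch_pubmed_abstract(title)`.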
@@ -460,14 +491,10 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
     if doi=="unknown" or len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower():
         # might find the article
         print("no article text, start tem link")
-        #tem_links = mtdna_classifier.search_google_custom(title, 2)
         tem_links = smart_fallback.smart_google_search(meta_expand)
         print("tem links: ", tem_links)
-        tem_link_acc = smart_fallback.google_accession_search(acc)
-        tem_links += tem_link_acc
         tem_links = unique_preserve_order(tem_links)
         print("tem link before filtering: ", tem_links)
-        # filter the quality link
         print("saveLinkFolder as sample folder id: ", sample_folder_id)
         print("start the smart filter link")
         if stop_flag is not None and stop_flag.value:
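With the per-accession Google search (`google_accession_search`) dropped here and in the three later fallback sites, `unique_preserve_order` now only dedupes the results of `smart_google_search` while keeping their ranking. The repo's implementation isn't shown in this diff; a plausible one is:

```python
def unique_preserve_order(items):
    # dict keys preserve insertion order in Python 3.7+, so this drops
    # duplicates while keeping each item's first-occurrence position
    return list(dict.fromkeys(items))
```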
@@ -548,6 +575,43 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
     print(f"✅ CrossRef metadata fetched for {link}")
     other_explain = "Because full-text is restricted by the publisher, our system uses abstracts and metadata to remain compliant while still supporting exploratory analysis, search, and literature linking."
     article_text = html.mergeTextInJson(metadata_text)
+    # Step 1: Search for the paper
+    print("search the paper's abstract on pubmed")
+    handle = Entrez.esearch(db="pubmed", term=title, retmax=1)
+    record = Entrez.read(handle)
+    id_list = record.get("IdList", [])
+
+    if not id_list:
+        print("No PubMed results found.")
+    else:
+        pubmed_id = id_list[0]
+        fetch_handle = Entrez.efetch(db="pubmed", id=pubmed_id, rettype="xml", retmode="xml")
+        fetch_record = Entrez.read(fetch_handle)
+
+        # Safe extraction
+        article = fetch_record.get("PubmedArticle", [])
+        if not article:
+            print("No PubmedArticle entry returned.")
+        else:
+            article = article[0]  # the real payload
+            try:
+                abstract_sections = (
+                    article["MedlineCitation"]["Article"]
+                    .get("Abstract", {})
+                    .get("AbstractText", [])
+                )
+                full_abstract = " ".join(str(s) for s in abstract_sections)
+
+                if full_abstract.strip():
+                    print("Abstract found (len={}):".format(len(full_abstract)))
+                    #print(full_abstract)
+                    article_text += full_abstract
+                else:
+                    print("This article has **no abstract available on PubMed**.")
+
+            except KeyError:
+                print("Abstract field missing in this PubMed record.")
+
     if article_text:
         if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text.lower() or "403 Forbidden Request".lower() not in article_text.lower():
             out_links[link] = article_text
@@ -564,8 +628,6 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
         #tem_links = mtdna_classifier.search_google_custom(title, 2)
         tem_links = smart_fallback.smart_google_search(meta_expand)
         print("tem links: ", tem_links)
-        tem_link_acc = smart_fallback.google_accession_search(acc)
-        tem_links += tem_link_acc
         tem_links = unique_preserve_order(tem_links)
         print("tem link before filtering: ", tem_links)
         # filter the quality link
@@ -607,6 +669,42 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
     print(f"✅ CrossRef metadata fetched for {link}")
     other_explain = "Because full-text is restricted by the publisher, our system uses abstracts and metadata to remain compliant while still supporting exploratory analysis, search, and literature linking."
     article_text = html.mergeTextInJson(metadata_text)
+    # Step 1: Search for the paper
+    print("search the paper's abstract on pubmed")
+    handle = Entrez.esearch(db="pubmed", term=title, retmax=1)
+    record = Entrez.read(handle)
+    id_list = record.get("IdList", [])
+
+    if not id_list:
+        print("No PubMed results found.")
+    else:
+        pubmed_id = id_list[0]
+        fetch_handle = Entrez.efetch(db="pubmed", id=pubmed_id, rettype="xml", retmode="xml")
+        fetch_record = Entrez.read(fetch_handle)
+
+        # Safe extraction
+        article = fetch_record.get("PubmedArticle", [])
+        if not article:
+            print("No PubmedArticle entry returned.")
+        else:
+            article = article[0]  # the real payload
+            try:
+                abstract_sections = (
+                    article["MedlineCitation"]["Article"]
+                    .get("Abstract", {})
+                    .get("AbstractText", [])
+                )
+                full_abstract = " ".join(str(s) for s in abstract_sections)
+
+                if full_abstract.strip():
+                    print("Abstract found (len={}):".format(len(full_abstract)))
+                    #print(full_abstract)
+                    article_text += full_abstract
+                else:
+                    print("This article has **no abstract available on PubMed**.")
+
+            except KeyError:
+                print("Abstract field missing in this PubMed record.")
     if article_text:
         if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text.lower() or "403 Forbidden Request".lower() not in article_text.lower():
             out_links[link] = article_text
@@ -623,8 +721,6 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
         #tem_links = mtdna_classifier.search_google_custom(title, 2)
         tem_links = smart_fallback.smart_google_search(meta_expand)
         print("tem links: ", tem_links)
-        tem_link_acc = smart_fallback.google_accession_search(acc)
-        tem_links += tem_link_acc
         tem_links = unique_preserve_order(tem_links)
         print("tem link before filtering: ", tem_links)
         # filter the quality link
@@ -682,14 +778,33 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
     if not all_output: all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
     if len(all_output) > 1*1000*1000:
         all_output = data_preprocess.normalize_for_overlap(all_output)
-        if len(all_output) > 1*1000*1000:
-            all_output = all_output[:1000000]
     if len(chunk) > 1*1000*1000:
         chunk = data_preprocess.normalize_for_overlap(chunk)
-        if len(chunk) > 1*1000*1000:
-            chunk = chunk[:1*1000*1000]
     print("chunk len: ", len(chunk))
-    print("all output len: ", len(all_output))
+    print("all output len: ", len(all_output))
+    # use build context for llm function to reduce token
+    reduce_context_for_llm = ""
+    if len(all_output)>900000 or len(chunk)>900000:
+        texts_reduce = []
+        out_links_reduce = {}
+        if links:
+            for link in links:
+                all_output_reduce, chunk_reduce, context_reduce = "", "",""
+                context_reduce, all_output_reduce, chunk_reduce = await process_link_chunk_allOutput(link,
+                                                        iso, acc, sample_folder_id, out_links_reduce,
+                                                        all_output_reduce, chunk_reduce)
+                texts_reduce.append(all_output_reduce)
+                out_links_reduce[link] = {"all_output": all_output_reduce}
+        input_prompt = ["country_name", "modern/ancient/unknown"]
+        if niche_cases: input_prompt += niche_cases
+        reduce_context_for_llm = data_preprocess.build_context_for_llm(texts_reduce, acc, input_prompt)
+        if reduce_context_for_llm:
+            print("reduce context for llm")
+            all_output = reduce_context_for_llm
+        else:
+            print("reduce context no succeed")
+            all_output = all_output[:900000]
+
     data_preprocess.save_text_to_docx(chunk, file_chunk_path)
     data_preprocess.save_text_to_docx(all_output, file_all_path)
     # Later when saving new files
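This hunk replaces blind truncation at 1 MB with a reduce-then-fallback pattern: when the text exceeds a 900,000-character budget, rebuild a focused context from the per-link texts, and only hard-truncate if that fails. Schematically (a hypothetical wrapper; `build_context` stands in for the repo's `data_preprocess.build_context_for_llm`, whose internals are not shown in this commit):

```python
LIMIT = 900_000  # character budget the commit uses for the LLM context

def shrink_context(all_output, texts, acc, prompts, build_context):
    """Reduce-then-fallback sketch of the diff's logic (hypothetical helper).

    build_context is expected to behave like data_preprocess.build_context_for_llm:
    take per-link texts, an accession, and the query prompts, and return a
    focused context string (falsy on failure).
    """
    if len(all_output) <= LIMIT:
        return all_output          # already within budget, leave untouched
    reduced = build_context(texts, acc, prompts)
    if reduced:
        return reduced             # focused context built from per-link texts
    return all_output[:LIMIT]      # last resort: hard truncation
```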
@@ -723,13 +838,18 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
     print("this is text for the last resort model")
     print(text)
 
-    predicted_outputs, method_used, total_query_cost = model.query_document_info(
+    predicted_outputs, method_used, total_query_cost, more_links = await model.query_document_info(
         niche_cases=niche_cases,
         query_word=primary_word, alternative_query_word=alternative_word,
+        saveLinkFolder = sample_folder_id,
         metadata=meta,
         master_structured_lookup=None, faiss_index=None, document_chunks=None,
         llm_api_function=model.call_llm_api, chunk=text, all_output=text)
-    print("
+    print("add more links from model.query document")
+    if more_links:
+        links += more_links
+        acc_score["source"] = links
+    print("this is llm results: ")
     for pred_out in predicted_outputs:
         # only for country, we have to standardize
         if pred_out == "country_name":
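`model.query_document_info` now returns a fourth value, `more_links`, and the call is awaited, so the function has evidently become a coroutine. One detail worth flagging: `links += more_links` can reintroduce duplicates that the earlier `unique_preserve_order` pass removed. A hedged variant reusing the file's own helper:

```python
# hypothetical tweak: keep the source list deduplicated after merging
if more_links:
    links = unique_preserve_order(links + more_links)
    acc_score["source"] = links
```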