VyLala committed on
Commit 04a4324 · verified · 1 Parent(s): 172369b

Update pipeline.py

Files changed (1)
  1. pipeline.py +142 -22
pipeline.py CHANGED
@@ -304,8 +304,6 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
     acc_score = { "isolate": "",
                   "country":{},
                   "sample_type":{},
-                  #"specific_location":{},
-                  #"ethnicity":{},
                   "query_cost":total_cost_title,
                   "time_cost":None,
                   "source":links,
@@ -395,10 +393,6 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
     if stand_country.lower() != "not found":
         acc_score["country"][stand_country.lower()] = ["ncbi"]
     else: acc_score["country"][country.lower()] = ["ncbi"]
-    # if spe_loc.lower() != "unknown":
-    #     acc_score["specific_location"][spe_loc.lower()] = ["ncbi"]
-    # if ethnic.lower() != "unknown":
-    #     acc_score["ethnicity"][ethnic.lower()] = ["ncbi"]
     if sample_type.lower() != "unknown":
         acc_score["sample_type"][sample_type.lower()] = ["ncbi"]
     # second way: LLM model
@@ -446,7 +440,44 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
     print(f"✅ CrossRef metadata fetched for {link}")
     other_explain = "Because full-text is restricted by the publisher, our system uses abstracts and metadata to remain compliant while still supporting exploratory analysis, search, and literature linking."
     article_text = html.mergeTextInJson(metadata_text)
-
+    # also try searching pubmed with the title and extract abstract and add to article text
+    # Step 1: Search for the paper
+    print("search the paper's abstract on pubmed")
+    handle = Entrez.esearch(db="pubmed", term=title, retmax=1)
+    record = Entrez.read(handle)
+    id_list = record.get("IdList", [])
+
+    if not id_list:
+        print("No PubMed results found.")
+    else:
+        pubmed_id = id_list[0]
+        fetch_handle = Entrez.efetch(db="pubmed", id=pubmed_id, rettype="xml", retmode="xml")
+        fetch_record = Entrez.read(fetch_handle)
+
+        # Safe extraction
+        article = fetch_record.get("PubmedArticle", [])
+        if not article:
+            print("No PubmedArticle entry returned.")
+        else:
+            article = article[0]  # the real payload
+            try:
+                abstract_sections = (
+                    article["MedlineCitation"]["Article"]
+                    .get("Abstract", {})
+                    .get("AbstractText", [])
+                )
+                full_abstract = " ".join(str(s) for s in abstract_sections)
+
+                if full_abstract.strip():
+                    print("Abstract found (len={}):".format(len(full_abstract)))
+                    #print(full_abstract)
+                    article_text += full_abstract
+                else:
+                    print("This article has **no abstract available on PubMed**.")
+
+            except KeyError:
+                print("Abstract field missing in this PubMed record.")
+
     if article_text:
         if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text.lower() or "403 Forbidden Request".lower() not in article_text.lower():
             out_links[link] = article_text
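For reference, the abstract lookup added here (and repeated in the later hunks) reduces to the standalone sketch below. It assumes Biopython's `Bio.Entrez` client with `Entrez.email` configured, which NCBI requires on every request; `fetch_pubmed_abstract` is a hypothetical name, not a function defined in this repo.

```python
# Minimal sketch of the PubMed-abstract fallback, assuming Biopython.
from Bio import Entrez

Entrez.email = "you@example.org"  # assumption: set to a real contact address

def fetch_pubmed_abstract(title: str) -> str:
    """Return the abstract of the top PubMed hit for `title`, or ""."""
    handle = Entrez.esearch(db="pubmed", term=title, retmax=1)
    id_list = Entrez.read(handle).get("IdList", [])
    if not id_list:
        return ""
    fetch_handle = Entrez.efetch(db="pubmed", id=id_list[0], rettype="xml", retmode="xml")
    record = Entrez.read(fetch_handle)
    articles = record.get("PubmedArticle", [])
    if not articles:
        return ""
    # AbstractText is a list of (possibly structured) sections; join them.
    sections = (
        articles[0]["MedlineCitation"]["Article"]
        .get("Abstract", {})
        .get("AbstractText", [])
    )
    return " ".join(str(s) for s in sections)
```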
@@ -460,14 +491,10 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
     if doi=="unknown" or len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower():
         # might find the article
         print("no article text, start tem link")
-        #tem_links = mtdna_classifier.search_google_custom(title, 2)
         tem_links = smart_fallback.smart_google_search(meta_expand)
         print("tem links: ", tem_links)
-        tem_link_acc = smart_fallback.google_accession_search(acc)
-        tem_links += tem_link_acc
         tem_links = unique_preserve_order(tem_links)
         print("tem link before filtering: ", tem_links)
-        # filter the quality link
         print("saveLinkFolder as sample folder id: ", sample_folder_id)
         print("start the smart filter link")
         if stop_flag is not None and stop_flag.value:
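`unique_preserve_order` is defined elsewhere in the module; the diff does not show its body, but the usual idiom it presumably wraps is order-preserving de-duplication via `dict.fromkeys` (insertion order is guaranteed in Python 3.7+):

```python
# Hypothetical implementation: dedupe while keeping the first occurrence.
def unique_preserve_order(items):
    return list(dict.fromkeys(items))

assert unique_preserve_order(["a", "b", "a", "c", "b"]) == ["a", "b", "c"]
```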
@@ -548,6 +575,43 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
     print(f"✅ CrossRef metadata fetched for {link}")
     other_explain = "Because full-text is restricted by the publisher, our system uses abstracts and metadata to remain compliant while still supporting exploratory analysis, search, and literature linking."
     article_text = html.mergeTextInJson(metadata_text)
+    # Step 1: Search for the paper
+    print("search the paper's abstract on pubmed")
+    handle = Entrez.esearch(db="pubmed", term=title, retmax=1)
+    record = Entrez.read(handle)
+    id_list = record.get("IdList", [])
+
+    if not id_list:
+        print("No PubMed results found.")
+    else:
+        pubmed_id = id_list[0]
+        fetch_handle = Entrez.efetch(db="pubmed", id=pubmed_id, rettype="xml", retmode="xml")
+        fetch_record = Entrez.read(fetch_handle)
+
+        # Safe extraction
+        article = fetch_record.get("PubmedArticle", [])
+        if not article:
+            print("No PubmedArticle entry returned.")
+        else:
+            article = article[0]  # the real payload
+            try:
+                abstract_sections = (
+                    article["MedlineCitation"]["Article"]
+                    .get("Abstract", {})
+                    .get("AbstractText", [])
+                )
+                full_abstract = " ".join(str(s) for s in abstract_sections)
+
+                if full_abstract.strip():
+                    print("Abstract found (len={}):".format(len(full_abstract)))
+                    #print(full_abstract)
+                    article_text += full_abstract
+                else:
+                    print("This article has **no abstract available on PubMed**.")
+
+            except KeyError:
+                print("Abstract field missing in this PubMed record.")
+
     if article_text:
         if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text.lower() or "403 Forbidden Request".lower() not in article_text.lower():
             out_links[link] = article_text
@@ -564,8 +628,6 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
     #tem_links = mtdna_classifier.search_google_custom(title, 2)
     tem_links = smart_fallback.smart_google_search(meta_expand)
     print("tem links: ", tem_links)
-    tem_link_acc = smart_fallback.google_accession_search(acc)
-    tem_links += tem_link_acc
     tem_links = unique_preserve_order(tem_links)
     print("tem link before filtering: ", tem_links)
     # filter the quality link
@@ -607,6 +669,42 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
     print(f"✅ CrossRef metadata fetched for {link}")
     other_explain = "Because full-text is restricted by the publisher, our system uses abstracts and metadata to remain compliant while still supporting exploratory analysis, search, and literature linking."
     article_text = html.mergeTextInJson(metadata_text)
+    # Step 1: Search for the paper
+    print("search the paper's abstract on pubmed")
+    handle = Entrez.esearch(db="pubmed", term=title, retmax=1)
+    record = Entrez.read(handle)
+    id_list = record.get("IdList", [])
+
+    if not id_list:
+        print("No PubMed results found.")
+    else:
+        pubmed_id = id_list[0]
+        fetch_handle = Entrez.efetch(db="pubmed", id=pubmed_id, rettype="xml", retmode="xml")
+        fetch_record = Entrez.read(fetch_handle)
+
+        # Safe extraction
+        article = fetch_record.get("PubmedArticle", [])
+        if not article:
+            print("No PubmedArticle entry returned.")
+        else:
+            article = article[0]  # the real payload
+            try:
+                abstract_sections = (
+                    article["MedlineCitation"]["Article"]
+                    .get("Abstract", {})
+                    .get("AbstractText", [])
+                )
+                full_abstract = " ".join(str(s) for s in abstract_sections)
+
+                if full_abstract.strip():
+                    print("Abstract found (len={}):".format(len(full_abstract)))
+                    #print(full_abstract)
+                    article_text += full_abstract
+                else:
+                    print("This article has **no abstract available on PubMed**.")
+
+            except KeyError:
+                print("Abstract field missing in this PubMed record.")
     if article_text:
         if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text.lower() or "403 Forbidden Request".lower() not in article_text.lower():
             out_links[link] = article_text
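The same abstract-lookup block is now pasted verbatim at three call sites (new lines 443-480, 578-614, and 672-707). A follow-up commit could hoist it into one helper, along the lines of the sketch below; `append_pubmed_abstract` is a hypothetical name, reusing the `fetch_pubmed_abstract` sketch from the first hunk above.

```python
# Hypothetical extraction of the thrice-duplicated block: one helper,
# called wherever CrossRef metadata alone is too thin.
def append_pubmed_abstract(article_text: str, title: str) -> str:
    abstract = fetch_pubmed_abstract(title)  # sketch defined earlier
    if abstract.strip():
        print("Abstract found (len={})".format(len(abstract)))
        return article_text + abstract
    print("No abstract available on PubMed.")
    return article_text
```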
@@ -623,8 +721,6 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
     #tem_links = mtdna_classifier.search_google_custom(title, 2)
     tem_links = smart_fallback.smart_google_search(meta_expand)
     print("tem links: ", tem_links)
-    tem_link_acc = smart_fallback.google_accession_search(acc)
-    tem_links += tem_link_acc
     tem_links = unique_preserve_order(tem_links)
     print("tem link before filtering: ", tem_links)
     # filter the quality link
@@ -682,14 +778,33 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
     if not all_output: all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
     if len(all_output) > 1*1000*1000:
         all_output = data_preprocess.normalize_for_overlap(all_output)
-        if len(all_output) > 1*1000*1000:
-            all_output = all_output[:1000000]
     if len(chunk) > 1*1000*1000:
         chunk = data_preprocess.normalize_for_overlap(chunk)
-        if len(chunk) > 1*1000*1000:
-            chunk = chunk[:1*1000*1000]
     print("chunk len: ", len(chunk))
-    print("all output len: ", len(all_output))
+    print("all output len: ", len(all_output))
+    # use build context for llm function to reduce token
+    reduce_context_for_llm = ""
+    if len(all_output)>900000 or len(chunk)>900000:
+        texts_reduce = []
+        out_links_reduce = {}
+        if links:
+            for link in links:
+                all_output_reduce, chunk_reduce, context_reduce = "", "",""
+                context_reduce, all_output_reduce, chunk_reduce = await process_link_chunk_allOutput(link,
+                    iso, acc, sample_folder_id, out_links_reduce,
+                    all_output_reduce, chunk_reduce)
+                texts_reduce.append(all_output_reduce)
+                out_links_reduce[link] = {"all_output": all_output_reduce}
+        input_prompt = ["country_name", "modern/ancient/unknown"]
+        if niche_cases: input_prompt += niche_cases
+        reduce_context_for_llm = data_preprocess.build_context_for_llm(texts_reduce, acc, input_prompt)
+        if reduce_context_for_llm:
+            print("reduce context for llm")
+            all_output = reduce_context_for_llm
+        else:
+            print("reduce context no succeed")
+            all_output = all_output[:900000]
+
     data_preprocess.save_text_to_docx(chunk, file_chunk_path)
     data_preprocess.save_text_to_docx(all_output, file_all_path)
     # Later when saving new files
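This hunk replaces the old hard truncation (`all_output[:1000000]`) with a rebuild-first strategy: when the text exceeds the budget, it re-derives a smaller, accession-focused context per link and only slices at 900,000 characters if the rebuild yields nothing. A minimal sketch of the pattern, with `rebuild` standing in for the project's `data_preprocess.build_context_for_llm` plumbing:

```python
# Sketch of the new size guard: prefer a rebuilt, targeted context over
# blind slicing, which risks cutting a record in half mid-field.
LIMIT = 900_000

def fit_context(all_output: str, rebuild) -> str:
    if len(all_output) <= LIMIT:
        return all_output
    reduced = rebuild()  # assumption: returns "" on failure
    return reduced if reduced else all_output[:LIMIT]
```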
@@ -723,13 +838,18 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
     print("this is text for the last resort model")
     print(text)

-    predicted_outputs, method_used, total_query_cost = model.query_document_info(
+    predicted_outputs, method_used, total_query_cost, more_links = await model.query_document_info(
         niche_cases=niche_cases,
         query_word=primary_word, alternative_query_word=alternative_word,
+        saveLinkFolder = sample_folder_id,
         metadata=meta,
         master_structured_lookup=None, faiss_index=None, document_chunks=None,
         llm_api_function=model.call_llm_api, chunk=text, all_output=text)
-    print("this is last resort results: ")
+    print("add more links from model.query document")
+    if more_links:
+        links += more_links
+        acc_score["source"] = links
+    print("this is llm results: ")
     for pred_out in predicted_outputs:
         # only for country, we have to standardize
         if pred_out == "country_name":
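Note the signature change: `model.query_document_info` is now a coroutine and returns a fourth value, `more_links`, which the pipeline folds back into `links` and `acc_score["source"]`; any other caller must be updated accordingly. The commit appends the new links raw, so duplicates can re-enter `source`; routing the merge through the module's own de-dup helper is one option (an assumption, not what the diff does):

```python
# Sketch of folding more_links back into the running source list with
# order-preserving de-duplication (the commit itself uses links += more_links).
def merge_source_links(links, more_links):
    if more_links:
        links = list(dict.fromkeys(links + more_links))
    return links

assert merge_source_links(["a"], ["b", "a"]) == ["a", "b"]
```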
 