Spaces:

VyLala
/

mtDNALocation

Running

VyLala committed 10 days ago

Commit

0955309

verified ·

1 Parent(s): 1d372fa

Update NER/html/extractHTML.py

Files changed (1) hide show

NER/html/extractHTML.py CHANGED Viewed

@@ -173,18 +173,23 @@ class HTML():
         # Use preloaded HTML (fast path)
         soup = self.openHTMLFile()
-        h2_tags = soup.find_all('h2')
-        for idx, h2 in enumerate(h2_tags):
-            section_title = h2.get_text(strip=True)
-            json.setdefault(section_title, [])
-            next_h2 = h2_tags[idx+1] if idx+1 < len(h2_tags) else None
-            for p in h2.find_all_next("p"):
-                if next_h2 and p == next_h2:
-                    break
-                json[section_title].append(p.get_text(strip=True))
         # If no sections or explicitly ScienceDirect
-        if scienceDirect is not None or len(json) == 0:
             print("async fetching ScienceDirect metadata...")
             api_key = "d0f25e6ae2b275e0d2b68e0e98f68d70"
             doi = self.htmlLink.split("https://doi.org/")[-1]
@@ -197,6 +202,7 @@ class HTML():
             #             data = await resp.json()
             #             if isinstance(data, dict):
             #                 json["fullText"] = data
             try:
                 timeout = aiohttp.ClientTimeout(total=8)  # hard 8 seconds
                 async with aiohttp.ClientSession(timeout=timeout) as session:

         # Use preloaded HTML (fast path)
         soup = self.openHTMLFile()
+        try:
+            h2_tags = soup.find_all('h2')
+            for idx, h2 in enumerate(h2_tags):
+                section_title = h2.get_text(strip=True)
+                json.setdefault(section_title, [])
+                next_h2 = h2_tags[idx+1] if idx+1 < len(h2_tags) else None
+                for p in h2.find_all_next("p"):
+                    if next_h2 and p == next_h2:
+                        break
+                    json[section_title].append(p.get_text(strip=True))
+        except Exception:
+            pass  # continue to fallback
         # If no sections or explicitly ScienceDirect
+        is_sciencedirect_source = "sciencedirect" in self.htmlLink.lower()
+        #if scienceDirect is not None or len(json) == 0:
+        if is_sciencedirect_source and (scienceDirect is not None or len(json) == 0):
             print("async fetching ScienceDirect metadata...")
             api_key = "d0f25e6ae2b275e0d2b68e0e98f68d70"
             doi = self.htmlLink.split("https://doi.org/")[-1]
             #             data = await resp.json()
             #             if isinstance(data, dict):
             #                 json["fullText"] = data
             try:
                 timeout = aiohttp.ClientTimeout(total=8)  # hard 8 seconds
                 async with aiohttp.ClientSession(timeout=timeout) as session: