Spaces:
Running
Running
Update NER/html/extractHTML.py
Browse files
- NER/html/extractHTML.py +17 -11
NER/html/extractHTML.py
CHANGED
|
@@ -173,18 +173,23 @@ class HTML():
|
|
| 173 |
|
| 174 |
# Use preloaded HTML (fast path)
|
| 175 |
soup = self.openHTMLFile()
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
|
|
|
|
|
|
| 186 |
# If no sections or explicitly ScienceDirect
|
| 187 |
-
|
|
|
|
|
|
|
|
|
|
| 188 |
print("async fetching ScienceDirect metadata...")
|
| 189 |
api_key = "d0f25e6ae2b275e0d2b68e0e98f68d70"
|
| 190 |
doi = self.htmlLink.split("https://doi.org/")[-1]
|
|
@@ -197,6 +202,7 @@ class HTML():
|
|
| 197 |
# data = await resp.json()
|
| 198 |
# if isinstance(data, dict):
|
| 199 |
# json["fullText"] = data
|
|
|
|
| 200 |
try:
|
| 201 |
timeout = aiohttp.ClientTimeout(total=8) # hard 8 seconds
|
| 202 |
async with aiohttp.ClientSession(timeout=timeout) as session:
|
|
|
|
| 173 |
|
| 174 |
# Use preloaded HTML (fast path)
|
| 175 |
soup = self.openHTMLFile()
|
| 176 |
+
try:
|
| 177 |
+
h2_tags = soup.find_all('h2')
|
| 178 |
+
for idx, h2 in enumerate(h2_tags):
|
| 179 |
+
section_title = h2.get_text(strip=True)
|
| 180 |
+
json.setdefault(section_title, [])
|
| 181 |
+
next_h2 = h2_tags[idx+1] if idx+1 < len(h2_tags) else None
|
| 182 |
+
for p in h2.find_all_next("p"):
|
| 183 |
+
if next_h2 and p == next_h2:
|
| 184 |
+
break
|
| 185 |
+
json[section_title].append(p.get_text(strip=True))
|
| 186 |
+
except Exception:
|
| 187 |
+
pass # continue to fallback
|
| 188 |
# If no sections or explicitly ScienceDirect
|
| 189 |
+
is_sciencedirect_source = "sciencedirect" in self.htmlLink.lower()
|
| 190 |
+
|
| 191 |
+
#if scienceDirect is not None or len(json) == 0:
|
| 192 |
+
if is_sciencedirect_source and (scienceDirect is not None or len(json) == 0):
|
| 193 |
print("async fetching ScienceDirect metadata...")
|
| 194 |
api_key = "d0f25e6ae2b275e0d2b68e0e98f68d70"
|
| 195 |
doi = self.htmlLink.split("https://doi.org/")[-1]
|
|
|
|
| 202 |
# data = await resp.json()
|
| 203 |
# if isinstance(data, dict):
|
| 204 |
# json["fullText"] = data
|
| 205 |
+
|
| 206 |
try:
|
| 207 |
timeout = aiohttp.ClientTimeout(total=8) # hard 8 seconds
|
| 208 |
async with aiohttp.ClientSession(timeout=timeout) as session:
|