VyLala committed on
Commit
0955309
·
verified ·
1 Parent(s): 1d372fa

Update NER/html/extractHTML.py

Browse files
Files changed (1) hide show
  1. NER/html/extractHTML.py +17 -11
NER/html/extractHTML.py CHANGED
@@ -173,18 +173,23 @@ class HTML():
173
 
174
  # Use preloaded HTML (fast path)
175
  soup = self.openHTMLFile()
176
- h2_tags = soup.find_all('h2')
177
- for idx, h2 in enumerate(h2_tags):
178
- section_title = h2.get_text(strip=True)
179
- json.setdefault(section_title, [])
180
- next_h2 = h2_tags[idx+1] if idx+1 < len(h2_tags) else None
181
- for p in h2.find_all_next("p"):
182
- if next_h2 and p == next_h2:
183
- break
184
- json[section_title].append(p.get_text(strip=True))
185
-
 
 
186
  # If no sections or explicitly ScienceDirect
187
- if scienceDirect is not None or len(json) == 0:
 
 
 
188
  print("async fetching ScienceDirect metadata...")
189
  api_key = "d0f25e6ae2b275e0d2b68e0e98f68d70"
190
  doi = self.htmlLink.split("https://doi.org/")[-1]
@@ -197,6 +202,7 @@ class HTML():
197
  # data = await resp.json()
198
  # if isinstance(data, dict):
199
  # json["fullText"] = data
 
200
  try:
201
  timeout = aiohttp.ClientTimeout(total=8) # hard 8 seconds
202
  async with aiohttp.ClientSession(timeout=timeout) as session:
 
173
 
174
  # Use preloaded HTML (fast path)
175
  soup = self.openHTMLFile()
176
+ try:
177
+ h2_tags = soup.find_all('h2')
178
+ for idx, h2 in enumerate(h2_tags):
179
+ section_title = h2.get_text(strip=True)
180
+ json.setdefault(section_title, [])
181
+ next_h2 = h2_tags[idx+1] if idx+1 < len(h2_tags) else None
182
+ for p in h2.find_all_next("p"):
183
+ if next_h2 and p == next_h2:
184
+ break
185
+ json[section_title].append(p.get_text(strip=True))
186
+ except Exception:
187
+ pass # continue to fallback
188
  # If no sections or explicitly ScienceDirect
189
+ is_sciencedirect_source = "sciencedirect" in self.htmlLink.lower()
190
+
191
+ #if scienceDirect is not None or len(json) == 0:
192
+ if is_sciencedirect_source and (scienceDirect is not None or len(json) == 0):
193
  print("async fetching ScienceDirect metadata...")
194
  api_key = "d0f25e6ae2b275e0d2b68e0e98f68d70"
195
  doi = self.htmlLink.split("https://doi.org/")[-1]
 
202
  # data = await resp.json()
203
  # if isinstance(data, dict):
204
  # json["fullText"] = data
205
+
206
  try:
207
  timeout = aiohttp.ClientTimeout(total=8) # hard 8 seconds
208
  async with aiohttp.ClientSession(timeout=timeout) as session: