Spaces:

VyLala
/

mtDNALocation

Running

App Files Files Community

mtDNALocation / NER /WordDoc /wordDoc.py

VyLala

Upload 54 files

4a80798 verified 4 months ago

raw

history blame contribute delete

11.9 kB

	#! pip install spire.doc
	#! pip install Spire.XLS
	import pandas as pd
	from spire.doc import *
	from spire.doc.common import *
	from spire.xls import *
	from spire.xls.common import *
	from NER import cleanText
	import requests
	class wordDoc():  # backed by Spire.Doc / Spire.XLS (not python-docx)
	    """Extract text and tables from a Word document using Spire.Doc.

	    Parameters
	    ----------
	    wordDoc : str
	        Location of the document. It is first handed to
	        ``Document.LoadFromFile`` as-is; if that fails it is treated as a
	        URL, downloaded with ``requests`` and loaded from the local copy.
	    saveFolder : str or None
	        Folder that receives downloaded copies and exported workbooks.
	    """

	    # Output folder used by extractTableAsExcel when saveFolder is None.
	    _DEFAULT_EXCEL_DIR = '/content/drive/MyDrive/CollectData/NER/excel/TestExamples/output/'
	    # Sentinel string returned by extractTableAsExcel when the export fails.
	    _NO_TABLE = "No table found on word doc"
	    # Seconds to wait on the HTTP download before giving up.
	    _DOWNLOAD_TIMEOUT = 15

	    def __init__(self, wordDoc, saveFolder):
	        self.wordDoc = wordDoc
	        self.saveFolder = saveFolder

	    def _load_document(self):
	        """Return a loaded Document, downloading ``self.wordDoc`` if needed.

	        Raises whatever ``requests`` or ``LoadFromFile`` raise when the
	        download fallback fails as well.
	        """
	        doc = Document()
	        try:
	            doc.LoadFromFile(self.wordDoc)
	            return doc
	        except Exception:
	            # Direct load failed; fall through and try wordDoc as a URL.
	            pass

	        response = requests.get(self.wordDoc, timeout=self._DOWNLOAD_TIMEOUT)
	        # Fail loudly on 4xx/5xx rather than saving an error page as a .docx.
	        response.raise_for_status()
	        folder = self.saveFolder or "."
	        os.makedirs(folder, exist_ok=True)
	        local_path = os.path.join(folder, self.wordDoc.split("/")[-1])
	        with open(local_path, "wb") as temp_file:
	            temp_file.write(response.content)
	        # Use a fresh Document: the first one went through a failed load.
	        doc = Document()
	        doc.LoadFromFile(local_path)
	        return doc

	    @staticmethod
	    def _paragraph_texts(cell):
	        """Return the ``Text`` of every paragraph of a table cell, in order."""
	        paragraphs = cell.Paragraphs
	        return [paragraphs.get_Item(p).Text for p in range(paragraphs.Count)]

	    @staticmethod
	    def _rows_of_cells(table):
	        """Return the table as a list of rows, each a list of cell objects."""
	        rows = table.Rows
	        result = []
	        for r in range(rows.Count):
	            # Fetch the cell collection once per row instead of once per cell.
	            cells = rows.get_Item(r).Cells
	            result.append([cells.get_Item(c) for c in range(cells.Count)])
	        return result

	    @staticmethod
	    def _section_tables(section):
	        """Return the tables of one section as a list."""
	        tables = section.Tables
	        return [tables.get_Item(i) for i in range(tables.Count)]

	    def openFile(self):
	        """Load the document and return the ``Document`` object.

	        The previous implementation returned the result of ``LoadFromFile``
	        rather than the document it had just loaded.
	        """
	        return self._load_document()

	    def extractTextByPage(self):
	        """Return the whole document text as given by ``Document.GetText()``.

	        Despite the name, no per-page split is performed.
	        reference: https://medium.com/@alice.yang_10652/extract-text-from-word-documents-with-python-a-comprehensive-guide-95a67e23c35c
	        """
	        doc = self._load_document()
	        try:
	            return doc.GetText()
	        finally:
	            doc.Close()

	    def extractTableAsText(self):
	        """Return every table rendered as tab/newline separated text.

	        The result maps ``"Section<s>"`` to a dict mapping ``"Table<i>"`` to
	        the table text. Inside a table, each paragraph of a cell is followed
	        by a space, cells are separated by a tab and every row ends with a
	        newline. Sections without tables map to an empty dict.
	        reference: https://www.e-iceblue.com/Tutorials/Python/Spire.Doc-for-Python/Program-Guide/Table/Python-Extract-Tables-from-Word-Documents.html
	        """
	        doc = self._load_document()
	        try:
	            result = {}
	            for s in range(doc.Sections.Count):
	                section_tables = {}
	                result["Section" + str(s)] = section_tables
	                tables = self._section_tables(doc.Sections.get_Item(s))
	                for i, table in enumerate(tables):
	                    lines = []
	                    for cells in self._rows_of_cells(table):
	                        cell_texts = [
	                            "".join(text + ' ' for text in self._paragraph_texts(cell))
	                            for cell in cells
	                        ]
	                        lines.append('\t'.join(cell_texts) + '\n')
	                    section_tables["Table" + str(i)] = "".join(lines)
	            return result
	        finally:
	            doc.Close()

	    def extractTableAsList(self):
	        """Return all tables as ``tables[table][row][cell] -> str``.

	        Paragraph texts are stripped and joined with single spaces; tables of
	        all sections are returned in document order in one flat list.
	        """
	        doc = self._load_document()
	        try:
	            tables = []
	            for s in range(doc.Sections.Count):
	                for table in self._section_tables(doc.Sections.get_Item(s)):
	                    table_data = []
	                    for cells in self._rows_of_cells(table):
	                        row_data = [
	                            " ".join(
	                                text.strip() for text in self._paragraph_texts(cell)
	                            ).strip()
	                            for cell in cells
	                        ]
	                        table_data.append(row_data)
	                    tables.append(table_data)
	            return tables
	        finally:
	            doc.Close()

	    def extractTableAsExcel(self):
	        """Write every table to its own worksheet and return the .xlsx path.

	        Worksheets are named ``Table_<section>_<table>`` (1-based). The file
	        is saved as ``<saveFolder>/<document name>.xlsx``, or under
	        ``_DEFAULT_EXCEL_DIR`` when ``saveFolder`` is None.

	        Returns the string "No table found on word doc" if anything fails
	        while building or saving the workbook (callers compare against that
	        string, so it is kept verbatim). Errors while loading the document
	        itself are not caught here.
	        reference: https://www.e-iceblue.com/Tutorials/Python/Spire.Doc-for-Python/Program-Guide/Table/Python-Extract-Tables-from-Word-Documents.html
	        """
	        doc = self._load_document()
	        wb = None
	        try:
	            wb = Workbook()
	            wb.Worksheets.Clear()
	            for i in range(doc.Sections.Count):
	                tables = self._section_tables(doc.Sections.get_Item(i))
	                for j, table in enumerate(tables):
	                    ws = wb.Worksheets.Add(f'Table_{i+1}_{j+1}')
	                    for row, cells in enumerate(self._rows_of_cells(table)):
	                        for col, cell in enumerate(cells):
	                            cellText = "".join(
	                                text + ' ' for text in self._paragraph_texts(cell)
	                            )
	                            # Worksheet coordinates are 1-based.
	                            ws.SetCellValue(row + 1, col + 1, cellText)

	            name = self.wordDoc.split("/")[-1]
	            if self.saveFolder is None:
	                nameFile = self._DEFAULT_EXCEL_DIR + name + ".xlsx"
	            else:
	                nameFile = self.saveFolder + '/' + name + ".xlsx"
	            wb.SaveToFile(nameFile, FileFormat.Version2016)
	            return nameFile
	        except Exception:
	            return self._NO_TABLE
	        finally:
	            # Release native resources on both the success and failure path.
	            doc.Close()
	            if wb is not None:
	                wb.Dispose()

	    def getReference(self):
	        """Placeholder; not implemented."""
	        pass

	    def getSupMaterial(self):
	        """Placeholder; not implemented."""
	        pass

	import os
	import requests
	from spire.doc import Document
	from spire.doc.common import *
	from spire.xls import Workbook, FileFormat

	class WordDocFast:
	    """Word-document reader that loads the document once and reuses it.

	    Parameters
	    ----------
	    wordDoc : str
	        Local path, or a URL (anything starting with ``"http"``), of the
	        document.
	    saveFolder : str or None
	        Folder for downloaded copies and exported workbooks; a falsy value
	        means the current directory.

	    The loaded ``Document`` is stored on the instance as ``self.doc`` and in
	    a class-level cache keyed by ``wordDoc``.
	    """

	    # Cache Document objects by file path/URL. Shared by all instances and
	    # never evicted by this class.
	    _cache = {}

	    def __init__(self, wordDoc, saveFolder):
	        self.wordDoc = wordDoc
	        self.saveFolder = saveFolder or "."
	        self.doc = self._load_document()

	    def _load_document(self):
	        """Return the Document for ``self.wordDoc``, loading it at most once.

	        URLs are downloaded into ``saveFolder`` unless a file with the same
	        name is already there. Raises ``requests`` errors on a failed
	        download (including HTTP error statuses).
	        """
	        # Use cache if available
	        if self.wordDoc in WordDocFast._cache:
	            return WordDocFast._cache[self.wordDoc]

	        local_path = self.wordDoc
	        if self.wordDoc.startswith("http"):
	            name = os.path.basename(self.wordDoc)
	            local_path = os.path.join(self.saveFolder, name)
	            if not os.path.exists(local_path):
	                r = requests.get(self.wordDoc, timeout=15)
	                r.raise_for_status()
	                os.makedirs(self.saveFolder, exist_ok=True)
	                # Write under a temporary name and rename afterwards, so an
	                # interrupted write is never mistaken for a finished download
	                # by the os.path.exists() check on a later run.
	                partial_path = local_path + ".part"
	                with open(partial_path, "wb") as f:
	                    f.write(r.content)
	                os.replace(partial_path, local_path)

	        doc = Document()
	        doc.LoadFromFile(local_path)
	        WordDocFast._cache[self.wordDoc] = doc
	        return doc

	    @staticmethod
	    def _paragraph_texts(cell):
	        """Return the ``Text`` of every paragraph of a table cell, in order."""
	        paragraphs = cell.Paragraphs
	        return [paragraphs.get_Item(p).Text for p in range(paragraphs.Count)]

	    def extractText(self):
	        """Return the full document text.

	        Tries ``Document.GetText()`` first and falls back to
	        ``extractTextBySections()``; if both fail, prints a message and
	        returns an empty string.
	        """
	        try:
	            return self.doc.GetText()
	        except Exception:
	            # Fall through to the paragraph-by-paragraph extraction.
	            pass
	        try:
	            return self.extractTextBySections()
	        except Exception:
	            print("extract word doc text failed")
	            return ''

	    def extractTextBySections(self):
	        """Return the non-empty, stripped section paragraphs joined by newlines."""
	        all_text = []
	        sections = self.doc.Sections
	        for s in range(sections.Count):
	            paragraphs = sections.get_Item(s).Paragraphs
	            for p in range(paragraphs.Count):
	                text = paragraphs.get_Item(p).Text.strip()
	                if text:
	                    all_text.append(text)
	        return "\n".join(all_text)

	    def extractTablesAsList(self):
	        """Return all tables as ``tables[table][row][cell] -> str``.

	        Paragraph texts of a cell are stripped and joined with single spaces.
	        """
	        tables = []
	        sections = self.doc.Sections
	        for s in range(sections.Count):
	            section_tables = sections.get_Item(s).Tables
	            for t in range(section_tables.Count):
	                rows = section_tables.get_Item(t).Rows
	                table_data = []
	                for r in range(rows.Count):
	                    # Fetch the cell collection once per row.
	                    cells = rows.get_Item(r).Cells
	                    row_data = [
	                        " ".join(
	                            text.strip()
	                            for text in self._paragraph_texts(cells.get_Item(c))
	                        ).strip()
	                        for c in range(cells.Count)
	                    ]
	                    table_data.append(row_data)
	                tables.append(table_data)
	        return tables

	    def extractTablesAsExcel(self):
	        """Export every table to its own worksheet and return the .xlsx path.

	        Worksheets are named ``Table_<section>_<table>`` (1-based) and the
	        workbook is saved as ``<saveFolder>/<basename of wordDoc>.xlsx``.
	        Exceptions raised while building or saving the workbook propagate;
	        the workbook is disposed either way.
	        """
	        name = os.path.basename(self.wordDoc) + ".xlsx"
	        out_path = os.path.join(self.saveFolder, name)
	        wb = Workbook()
	        try:
	            wb.Worksheets.Clear()
	            sections = self.doc.Sections
	            for s in range(sections.Count):
	                section_tables = sections.get_Item(s).Tables
	                for t in range(section_tables.Count):
	                    rows = section_tables.get_Item(t).Rows
	                    ws = wb.Worksheets.Add(f"Table_{s+1}_{t+1}")
	                    for r in range(rows.Count):
	                        cells = rows.get_Item(r).Cells
	                        for c in range(cells.Count):
	                            cell_text = " ".join(
	                                self._paragraph_texts(cells.get_Item(c))
	                            ).strip()
	                            # Worksheet coordinates are 1-based.
	                            ws.SetCellValue(r + 1, c + 1, cell_text)
	            os.makedirs(self.saveFolder, exist_ok=True)
	            wb.SaveToFile(out_path, FileFormat.Version2016)
	        finally:
	            wb.Dispose()
	        return out_path