Spaces:
Running
Running
Update data_preprocess.py
Browse files- data_preprocess.py +14 -0
data_preprocess.py
CHANGED
|
@@ -21,6 +21,20 @@ import pipeline
|
|
| 21 |
import tempfile
|
| 22 |
import nltk
|
| 23 |
nltk.download('punkt_tab')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
def download_excel_file(url, save_path="temp.xlsx"):
|
| 25 |
if "view.officeapps.live.com" in url:
|
| 26 |
parsed_url = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
|
|
|
|
| 21 |
import tempfile
|
| 22 |
import nltk
|
| 23 |
nltk.download('punkt_tab')
|
| 24 |
+
# Small built-in stop-word list, kept for when NLTK's 'stopwords' corpus can
# be neither found locally nor downloaded.
# NOTE(review): the flattened diff does not show whether STOPWORDS was
# originally assigned only in the failure branch; it is defined
# unconditionally here so the name always exists - confirm against callers.
STOPWORDS = {
    "the", "a", "an", "in", "on", "of", "and", "or", "for", "with", "to", "from",
    "is", "are", "was", "were", "be", "been", "by", "this", "that", "these", "those",
    "it", "its", "as", "at", "but", "not", "no", "so", "if", "their", "there", "about",
    "into", "such", "than", "other", "then", "also", "can", "may", "might", "should",
}

# Make sure the NLTK stop-word corpus is present, fetching it on first use.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    # nltk.download() normally reports failure by returning False instead of
    # raising, so check its return value as well as catching exceptions.
    # `except Exception` (not a bare `except`) lets KeyboardInterrupt and
    # SystemExit propagate.
    try:
        _stopwords_downloaded = nltk.download('stopwords')
    except Exception:
        _stopwords_downloaded = False
    if not _stopwords_downloaded:
        print("have to use our own created stopword")
|
| 37 |
+
|
| 38 |
def download_excel_file(url, save_path="temp.xlsx"):
|
| 39 |
if "view.officeapps.live.com" in url:
|
| 40 |
parsed_url = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
|