VyLala committed on
Commit
3f0149b
·
verified ·
1 Parent(s): 86e6372

Update data_preprocess.py

Browse files
Files changed (1) hide show
  1. data_preprocess.py +14 -0
data_preprocess.py CHANGED
@@ -21,6 +21,20 @@ import pipeline
21
  import tempfile
22
  import nltk
23
  nltk.download('punkt_tab')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  def download_excel_file(url, save_path="temp.xlsx"):
25
  if "view.officeapps.live.com" in url:
26
  parsed_url = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
 
21
  import tempfile
22
  import nltk
23
  nltk.download('punkt_tab')
24
+ try:
25
+ nltk.data.find('corpora/stopwords')
26
+ except LookupError:
27
+ try:
28
+ nltk.download('stopwords')
29
+ except:
30
+ print("have to use our own created stopword")
31
+ STOPWORDS = {
32
+ "the","a","an","in","on","of","and","or","for","with","to","from",
33
+ "is","are","was","were","be","been","by","this","that","these","those",
34
+ "it","its","as","at","but","not","no","so","if","their","there","about",
35
+ "into","such","than","other","then","also","can","may","might","should"
36
+ }
37
+
38
  def download_excel_file(url, save_path="temp.xlsx"):
39
  if "view.officeapps.live.com" in url:
40
  parsed_url = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)