Rastra Wardana Nanditama
Saya melakukan text processing pada dataset Sentiment Labelled Sentences. Tahapan yang dilakukan adalah Filtering, Case Folding, dan Stopwords Removal.
# Mount Google Drive at /content/drive so the dataset files stored there can
# be read by the cells below. `google.colab` is only importable inside Colab.
from google.colab import drive

drive.mount('/content/drive')
# Standard library
import re

# Third-party: pandas for tabular data, NLTK for tokenizing / stopwords /
# stemming utilities.
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources requested by this notebook.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
# Folder on the mounted Drive that holds the three labelled-sentence files.
DATASET_DIR = "/content/drive/My Drive/dataset/sentiment labelled sentences/"


def _read_labelled(file_name):
    """Read one labelled-sentences file into a DataFrame.

    The file is expected to hold one ``sentence<TAB>label`` pair per line and
    no header row, so the columns are named explicitly: ``kalimat`` (the
    sentence) and ``kelas`` (the label).

    Args:
        file_name: Name of the file inside ``DATASET_DIR``.

    Returns:
        A DataFrame with the columns ``kalimat`` and ``kelas``.
    """
    # The separator must be a tab: splitting on spaces would break every
    # sentence into one field per word.
    # NOTE(review): some sentences may contain unbalanced double quotes; if the
    # row count looks too low, try quoting=csv.QUOTE_NONE -- confirm on the data.
    return pd.read_csv(
        DATASET_DIR + file_name,
        delimiter='\t',
        header=None,
        names=['kalimat', 'kelas'],
    )


data_amazon = _read_labelled("amazon_cells_labelled.txt")
data_imdb = _read_labelled("imdb_labelled.txt")
data_yelp = _read_labelled("yelp_labelled.txt")
# Stack the Amazon, IMDb and Yelp frames into a single DataFrame.
# ignore_index=True gives the result a fresh 0..n-1 index instead of repeating
# each source frame's own row labels.
combine = pd.concat([data_amazon, data_imdb, data_yelp], ignore_index=True)

# Bare expression: shows the combined frame as the notebook cell output.
combine
# Split the combined frame into plain arrays: first column holds the
# sentences, second column holds their labels.
kalimat = combine.iloc[:, 0].values
kelas = combine.iloc[:, 1].values
# Patterns are compiled once, outside the per-sentence loop.
_NON_WORD_RE = re.compile(r'[^\w\s]')  # punctuation and special characters
_DIGITS_RE = re.compile(r'\d+')        # runs of digits
_SPACES_RE = re.compile(r'\s+')        # runs of whitespace


def _clean_sentence(raw):
    """Apply filtering and case folding to one sentence.

    Steps, in order: replace punctuation/special characters with a space,
    lowercase, replace digit runs with a space, then collapse every run of
    whitespace into a single space.

    Args:
        raw: The sentence; it is converted with ``str()`` first, so
            non-string values are accepted.

    Returns:
        The cleaned sentence. Leading/trailing single spaces are not stripped.
    """
    text = _NON_WORD_RE.sub(' ', str(raw))
    text = text.lower()
    text = _DIGITS_RE.sub(' ', text)
    return _SPACES_RE.sub(' ', text)


# Cleaned sentences, in the same order as `kalimat`.
klmt = [_clean_sentence(sentence) for sentence in kalimat]
# Rebuild `combine` from the cleaned sentences and the original labels.
# Building both columns at once also works when `klmt` is empty, where
# assigning a column name to an empty, column-less frame would fail.
combine = pd.DataFrame({'kalimat': klmt, 'kelas': kelas})

# Bare expression: shows the first rows as the notebook cell output.
combine.head()
# English stopword list from NLTK; the frozenset copy makes the per-word
# membership test constant-time instead of a scan over the list.
stopwrd = stopwords.words('english')
_stopword_set = frozenset(stopwrd)


def _drop_stopwords(sentence):
    """Return `sentence` with every whitespace-separated stopword removed.

    The remaining words are re-joined with single spaces, which also drops any
    leading or trailing whitespace.
    """
    return ' '.join(
        word for word in sentence.split() if word not in _stopword_set
    )


combine['kalimat'] = combine['kalimat'].apply(_drop_stopwords)

# Bare expression: shows the first ten rows as the notebook cell output.
combine.head(10)