Sentiment Analysis on the Farmers' Protest Using RNN-LSTM

Dataset: https://drive.google.com/drive/folders/1vcFqYVicEA665l7ezUSYIY7f-fj6u0W7?usp=sharing

Reference:

Ashwin Sanjay Neogi, Kirti Anilkumar Garg, Ram Krishn Mishra, Yogesh K Dwivedi, "Sentiment analysis and classification of Indian farmers’ protest using twitter data", International Journal of Information Management Data Insights, Volume 1, Issue 2, 2021, 100019, ISSN 2667-0968, https://doi.org/10.1016/j.jjimei.2021.100019.

Dataset Description:

  • A total of 18,000 tweets have been collected over a period of four months.

  • Since the farmers’ protest began around November 2020, we chose 5th November 2020 as the starting date and 5th March 2021 as the ending date. A customized script, built with the DateTime library, retrieved 150 tweets per day and stored them in a Python list (a sketch of this loop appears after this list).

  • We used the keyword ‘farmers protest’ as the search query, so that all tweets containing the words “farmers”, “protest”, and “farmers protest” were aggregated together.
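The retrieval script itself is not part of this repository. Below is a minimal sketch of the per-day loop described above, assuming a hypothetical fetch_tweets(query, day) helper that wraps whatever Twitter client was used (the helper name and signature are illustrative, not from the original script):

from datetime import date, timedelta

start, end = date(2020, 11, 5), date(2021, 3, 5)
all_tweets = []  # Python list that accumulates the tweets
day = start
while day <= end:
    # fetch_tweets is hypothetical; cap at 150 tweets per day as described
    all_tweets.extend(fetch_tweets("farmers protest", day)[:150])
    day += timedelta(days=1)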

Importing Required Libraries

import numpy as np
import pandas as pd
import re

from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

!pip install twython

nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()
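As a quick sanity check, polarity_scores returns a dict with neg, neu, pos, and a compound score in [-1, 1]; the compound value is what gets thresholded later (the sentence here is an illustrative example, not from the dataset):

print(sid.polarity_scores("Farmers are protesting peacefully for their rights"))
# -> a dict like {'neg': ..., 'neu': ..., 'pos': ..., 'compound': ...}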

Reading Dataset

# Import the data file and keep only the tweet text column
df = pd.read_csv("Tweets.csv")
df = pd.DataFrame({'Text': df['Text']})

# Clean the tweets

def cleantext(df):
    # work on a copy so the original tweet column is unchanged
    df['cleaned_tweet'] = df['Text'].copy()
    # remove user mentions at the beginning of the tweet
    df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'^(@\w+)', "", regex=True)
    # remove 'rt @' at the beginning of retweets
    df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'^(rt @)', "", regex=True)
    # remove URLs (everything from 'https' to the end of the tweet)
    df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'https.*$', "", regex=True)
    # remove the stray 'amp' left over from HTML-escaped ampersands (&amp;)
    df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'\bamp\b', "", regex=True)
    # strip leading and trailing whitespace
    df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'^\s+|\s+$', "", regex=True)
    return df


# get the processed tweets
df = cleantext(df)
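To see what the cleaning does, here is a small illustrative run on a made-up tweet (the sample text is not from the dataset):

demo = pd.DataFrame({'Text': ["@someuser farmers protest is gaining support &amp; attention https://t.co/xyz"]})
print(cleantext(demo)['cleaned_tweet'].iloc[0])
# the leading mention, the URL, and the 'amp' token are stripped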

Print Columns

print(df.columns)

Finding Sentiment Scores

df['scores'] = df['cleaned_tweet'].apply(lambda tweet: sid.polarity_scores(tweet))

df.head()

Compound Score

df['compound'] = df['scores'].apply(lambda score_dict: score_dict['compound'])

df['sentiment'] = ''
df.loc[df.compound > 0, 'sentiment'] = 'Positive'
df.loc[df.compound == 0, 'sentiment'] = 'Neutral'
df.loc[df.compound < 0, 'sentiment'] = 'Negative'

df.head()
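For reference, the VADER authors' own guideline treats compound >= 0.05 as positive, compound <= -0.05 as negative, and the band in between as neutral. The zero threshold above follows the original notebook; the conventional variant would look like this (optional, not used below):

# alternative thresholds per the VADER documentation (not used in this notebook)
df.loc[df.compound >= 0.05, 'sentiment'] = 'Positive'
df.loc[(df.compound > -0.05) & (df.compound < 0.05), 'sentiment'] = 'Neutral'
df.loc[df.compound <= -0.05, 'sentiment'] = 'Negative'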

Count the Sentiment Values

print(df['sentiment'].value_counts())

Create a New DataFrame

new_df = df[['cleaned_tweet','sentiment']]

print(new_df.sentiment)

Removing Neutral Sentiments

new_df = new_df[new_df.sentiment != "Neutral"]

new_df.head()

Tokenization

tokenizer = Tokenizer(num_words=1500, split=' ')
tokenizer.fit_on_texts(new_df['cleaned_tweet'].values)
X = tokenizer.texts_to_sequences(new_df['cleaned_tweet'])
X = pad_sequences(X)
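For intuition, texts_to_sequences maps each word to an integer index learned during fit_on_texts (1 = most frequent word), and pad_sequences zero-pads every sequence on the left to the longest length; a toy illustration with made-up sentences:

toy = Tokenizer(num_words=1500, split=' ')
toy.fit_on_texts(["farmers protest delhi", "support farmers"])
seqs = toy.texts_to_sequences(["farmers protest delhi", "support farmers"])
print(seqs)                 # [[1, 2, 3], [4, 1]] -- 'farmers' is most frequent
print(pad_sequences(seqs))  # [[1 2 3], [0 4 1]] after left-padding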

Model Building Using RNN-LSTM

embed_dim = 128
lstm_out = 196

model = Sequential()
# input_dim must cover the tokenizer vocabulary (num_words=1500);
# input_length is the padded sequence length rather than a hard-coded value
model.add(Embedding(1500, embed_dim, input_length=X.shape[1]))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
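A model.summary() call here is a cheap way to confirm that the layer output shapes line up with the padded input length before training:

model.summary()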

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
# encodes the labels alphabetically: Negative -> 0, Positive -> 1
y = le.fit_transform(new_df['sentiment'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
len(y)
print(y_train)

history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=5, batch_size=32)
model.evaluate(X_test, y_test)

print("Prediction: ",model.predict_classes(X_test[5:10]))


print("Actual: \n",y_test[5:10])

import matplotlib.pyplot as plt

print(history.history.keys())

# summarize history for accuracy
plt.figure(figsize=(10, 8))
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

# summarize history for loss
plt.figure(figsize=(10, 8))
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

# predicted class indices for the whole test set (argmax replaces predict_classes)
model_predict = np.argmax(model.predict(X_test), axis=1)
model_predict_df = pd.DataFrame(model_predict, columns=['Labels'])
# map the encoded labels back to their names
model_predict_df['Labels'] = model_predict_df['Labels'].map({0: 'Negative', 1: 'Positive'})
model_predict_df.head()

import seaborn as sns

plt.figure(figsize=(10, 6))
# newer seaborn versions require the keyword form
sns.countplot(x='Labels', data=model_predict_df)
plt.title("Bar Plot of Predicted Sentiments", fontsize=15)
plt.xlabel("Predicted Sentiment Labels", fontsize=15)
plt.ylabel("Count", fontsize=15)
plt.show()

model_predict_df['Labels'].value_counts()

from sklearn.metrics import classification_report

print(classification_report(y_test,model_predict))
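A confusion matrix is a natural companion to the classification report; this addition (not in the original notebook) reuses the same predictions:

from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, model_predict))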