## =======================================================
## IMPORTING
## =======================================================
import pandas as pd
# Load the Kaggle phrase-level sentiment training data (tab-separated file).
train = pd.read_csv("../WK7/kaggle-sentiment/train.tsv", sep='\t')

# Raw phrase text and its sentiment label as plain arrays.
X = train['Phrase'].values
y = train['Sentiment'].values

# Snapshot the frame as CSV (the row index is written as well).
train.to_csv('kaggle_csv.csv')

# all_df.to_csv('kaggle_0_not0.csv', index=False)
t_sm = train[train.columns[2:5]]

# NOTE(review): this used to be the double assignment
#     train['t0'] = train['Sentiment'][train['Sentiment'] == 0] = 'neg'
# which fills 't0' with the constant 'neg' on every row and additionally tries
# to overwrite the Sentiment-0 labels through chained indexing.  Only the
# constant column is kept (it is part of the CSV written below); 'Sentiment'
# has to stay numeric because later cells reload it and compare it to ints.
train['t0'] = 'neg'

# Collapse the 5-point scale into polarity labels in a new column 'S2':
#     0, 1 -> 'neg'      3, 4 -> 'pos'      2 stays the integer 2
# The column is object-typed so strings and the remaining ints can coexist.
# .loc writes into `train` itself rather than into a temporary Series, which
# is what the chained form train['S2'][mask] = ... cannot guarantee (hence the
# SettingWithCopyWarning it produced).
train['S2'] = train['Sentiment'].astype(object)
train.loc[train['Sentiment'].isin([0, 1]), 'S2'] = 'neg'
train.loc[train['Sentiment'].isin([3, 4]), 'S2'] = 'pos'

# Two-column working copy: phrase text plus its collapsed label.
train_sm = train[['Phrase', 'S2']].copy()
train_sm


# Persist the frame again, now including the 't0' and 'S2' columns.
train.to_csv('kaggle_csv.csv')

/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

# Row counts for the collapsed labels.  Each bare number below is the output
# that was recorded for the expression above it.

# Total number of phrases.
train_sm.shape[0]

156060

# Phrases labelled 'neg'.
int((train_sm['S2'] == 'neg').sum())

34345

# Phrases labelled 'pos'.
int((train_sm['S2'] == 'pos').sum())

42133

# Phrases still carrying the integer label 2.
int((train_sm['S2'] == 2).sum())

79582

# Phrase counts per collapsed label, in the order neg, pos, 2 (neutral).
# The third entry used to repeat the 'pos' count (42133); the count recorded
# for label 2 is 79582, and only with it do the three sum to the 156060 rows.
ar = [34345, 42133, 79582]
import matplotlib.pyplot as plt

# matplotlib.pyplot has no `barplot` (that name is seaborn's); the pyplot
# call is `bar`.  Positions are given as integers and labelled through
# `tick_label` so the three counts in `ar` are drawn as n / p / 2 bars.
plt.bar(range(len(ar)), ar, tick_label=['n', 'p', '2'])

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-45-1e325f98e22c> in <module>
      2 import matplotlib.pyplot as plt
      3 
----> 4 plt.barplot()

AttributeError: module 'matplotlib.pyplot' has no attribute 'barplot'

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


# Toy data: one shared x axis and three value series.
x = [1, 2, 3]

y = [4, 9, 2]
z = [1, 2, 3]
k = [11, 12, 13]

# Number of series; the x values are repeated once per series.
n_bars = 3

# Long-format frame: one row per (time, kind) pair, the series stacked in the
# order y, z, k.
df = pd.DataFrame(
    {
        "time": x * n_bars,
        "kind": [label for label in ("y", "z", "k") for _ in x],
        "data": [*y, *z, *k],
    },
    columns=["time", "kind", "data"],
)
# Grouped bar chart of the long-format frame `df`: "time" on the x axis,
# "data" as bar height, one bar colour per "kind".
# plt.figure() opens the 10x6 figure that the seaborn call then draws into.
plt.figure(figsize=(10, 6))
sns.barplot(x="time", hue="kind", y="data", data=df)
plt.show()

# Display the frame that was plotted.
df

# Phrase counts per collapsed label, in the order neg, pos, 2 (neutral).
# The third entry used to repeat the 'pos' count (42133); the count recorded
# for label 2 is 79582, and only with it do the three sum to the 156060 rows.
ar = [34345, 42133, 79582]
import matplotlib.pyplot as plt

# Small frame pairing each label with its count, then one bar per label.
df = pd.DataFrame({'labels': ['n', 'p', '2'], 'nums': ar})
df
sns.barplot(data=df, x='labels', y='nums')

<matplotlib.axes._subplots.AxesSubplot at 0x1a1ec92b50>

# Write `train` to disk.  The row index is written too (no index=False),
# which is why the reloaded frame shows an extra 'Unnamed: 0' column.
train.to_csv('kaggle_csv.csv')

# sns.barplot(x='Sentiment', y=train['Sentiment'].count_(), data=train)

# Reload the file into a separate frame `k`.
# NOTE(review): after the round trip the mixed 'S2' column appears to come
# back as strings (a later cell compares it against '2', not 2) -- confirm.
k = pd.read_csv('kaggle_csv.csv')
k

# Binary polarity dataset: phrase text plus the collapsed 'S2' label.
k2 = k[['Phrase', 'S2']].copy()
k2

# Discard the neutral phrases.  The label is compared as the string '2'
# because `k` was read back from CSV.
k2 = k2.drop(k2[k2['S2'] == '2'].index)

# Encode the remaining labels as integers: 'neg' -> 0, 'pos' -> 1.
# .loc assigns into k2 itself; the chained form k2['S2'][mask] = ... writes
# through an intermediate Series and is not guaranteed to reach the frame.
k2.loc[k2['S2'] == 'neg', 'S2'] = 0
k2.loc[k2['S2'] == 'pos', 'S2'] = 1

# k2.to_csv('kaggle_pos1_neg0.csv')

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
~/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2896             try:
-> 2897                 return self._engine.get_loc(key)
   2898             except KeyError:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'S2'

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-88-30bd4b9e8fbe> in <module>
----> 1 k2['S2'][k2['S2'] == 'neg'] = 0
      2 k2['S2'][k2['S2'] == 'pos'] = 1
      3 k2.columns = [0,'PoN']
      4 k2
      5 # k2.to_csv('kaggle_pos1_neg0.csv')

~/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py in __getitem__(self, key)
   2978             if self.columns.nlevels > 1:
   2979                 return self._getitem_multilevel(key)
-> 2980             indexer = self.columns.get_loc(key)
   2981             if is_integer(indexer):
   2982                 indexer = [indexer]

~/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2897                 return self._engine.get_loc(key)
   2898             except KeyError:
-> 2899                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   2900         indexer = self.get_indexer([key], method=method, tolerance=tolerance)
   2901         if indexer.ndim > 1 or indexer.size > 1:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'S2'

# Rename the two columns of k2 for the exported dataset.
# NOTE(review): this renames k2 in place, so a cell that still indexes
# k2['S2'] fails with KeyError: 'S2' when re-run afterwards -- presumably the
# cause of the KeyError traceback recorded in this notebook.
k2.columns = ['text','PoN']
k2

# Export without the row index, so the file holds only 'text' and 'PoN'.
k2.to_csv('kaggle_pos1_neg0.csv', index=False)

# Display the full reloaded frame for comparison.
k

# Start again from the saved CSV.
k = pd.read_csv('kaggle_csv.csv')

# 'S0' separates label 0 from everything else on the 0-4 scale:
#     0 stays 0, 1 stays 1, and 2 / 3 / 4 collapse to 1.
# Only the last three need rewriting, and .loc applies the change to `k`
# itself (the chained form k['S0'][mask] = ... raised SettingWithCopyWarning).
k['S0'] = k['Sentiment']
k.loc[k['S0'].isin([2, 3, 4]), 'S0'] = 1
k

/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys

# Reload the saved CSV into `k3` and display it.
k3 = pd.read_csv('kaggle_csv.csv')
k3

156060

# Rows whose 'S0_0' flag is 0.
df[df['S0_0'] == 0]

# Number of rows whose 'S0' label equals 0.  A boolean mask has to be summed
# (True counts as 1); `.count()` only counts non-null entries, so it always
# returned the full row count -- the 156060 it reported equals len(df).
# (df['S0'] == 1).sum()
(df['S0'] == 0).sum()

156060

# Total number of rows in df (recorded output below).
len(df)

156060

# Try to cast the 'S0' labels to integers.
# NOTE: this raises ValueError while the column still contains string labels
# such as 'neg' -- that is the failure in the recorded traceback.
df['S0'].astype(int)

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-140-d5c9ce8cbf4b> in <module>
----> 1 df['S0'].astype(int)

~/anaconda3/lib/python3.7/site-packages/pandas/core/generic.py in astype(self, dtype, copy, errors, **kwargs)
   5880             # else, only a single dtype is given
   5881             new_data = self._data.astype(
-> 5882                 dtype=dtype, copy=copy, errors=errors, **kwargs
   5883             )
   5884             return self._constructor(new_data).__finalize__(self)

~/anaconda3/lib/python3.7/site-packages/pandas/core/internals/managers.py in astype(self, dtype, **kwargs)
    579 
    580     def astype(self, dtype, **kwargs):
--> 581         return self.apply("astype", dtype=dtype, **kwargs)
    582 
    583     def convert(self, **kwargs):

~/anaconda3/lib/python3.7/site-packages/pandas/core/internals/managers.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)
    436                     kwargs[k] = obj.reindex(b_items, axis=axis, copy=align_copy)
    437 
--> 438             applied = getattr(b, f)(**kwargs)
    439             result_blocks = _extend_blocks(applied, result_blocks)
    440 

~/anaconda3/lib/python3.7/site-packages/pandas/core/internals/blocks.py in astype(self, dtype, copy, errors, values, **kwargs)
    557 
    558     def astype(self, dtype, copy=False, errors="raise", values=None, **kwargs):
--> 559         return self._astype(dtype, copy=copy, errors=errors, values=values, **kwargs)
    560 
    561     def _astype(self, dtype, copy=False, errors="raise", values=None, **kwargs):

~/anaconda3/lib/python3.7/site-packages/pandas/core/internals/blocks.py in _astype(self, dtype, copy, errors, values, **kwargs)
    641                     # _astype_nansafe works fine with 1-d only
    642                     vals1d = values.ravel()
--> 643                     values = astype_nansafe(vals1d, dtype, copy=True, **kwargs)
    644 
    645                 # TODO(extension)

~/anaconda3/lib/python3.7/site-packages/pandas/core/dtypes/cast.py in astype_nansafe(arr, dtype, copy, skipna)
    705         # work around NumPy brokenness, #1987
    706         if np.issubdtype(dtype.type, np.integer):
--> 707             return lib.astype_intsafe(arr.ravel(), dtype).reshape(arr.shape)
    708 
    709         # if we have a datetime/timedelta array of objects

pandas/_libs/lib.pyx in pandas._libs.lib.astype_intsafe()

ValueError: invalid literal for int() with base 10: 'neg'

# Reload the saved CSV into `k3`.
k3 = pd.read_csv('kaggle_csv.csv')

# Smallest value in the 'Sentiment' column (recorded output: 0).
k3['Sentiment'].values.min()

0

# Working frame built from the reloaded CSV: phrase text plus the sentiment
# label, stored under the name 'S0'.
df = pd.DataFrame({'Phrase': k3['Phrase'], 'S0': k3['Sentiment']})

# 'S0_0' is 0 where the label equals 0 and 1 for every other value.
df['S0_0'] = (df['S0'] != 0).astype(int)

# Rows flagged 1 (recorded output below).
int((df['S0_0'] == 1).sum())

148988

# Rows flagged 0 (recorded output below).
int((df['S0_0'] == 0).sum())

7072

df

	Phrase	S2
0	A series of escapades demonstrating the adage ...	neg
1	A series of escapades demonstrating the adage ...	2
2	A series	2
3	A	2
4	series	2
...	...	...
156055	Hearst 's	2
156056	forced avuncular chortles	neg
156057	avuncular chortles	pos
156058	avuncular	2
156059	chortles	2

	Unnamed: 0	PhraseId	SentenceId	Phrase	Sentiment	t0	S2
0	0	1	1	A series of escapades demonstrating the adage ...	1	neg	neg
1	1	2	1	A series of escapades demonstrating the adage ...	2	neg	2
2	2	3	1	A series	2	neg	2
3	3	4	1	A	2	neg	2
4	4	5	1	series	2	neg	2
...	...	...	...	...	...	...	...
156055	156055	156056	8544	Hearst 's	2	neg	2
156056	156056	156057	8544	forced avuncular chortles	1	neg	neg
156057	156057	156058	8544	avuncular chortles	3	neg	pos
156058	156058	156059	8544	avuncular	2	neg	2
156059	156059	156060	8544	chortles	2	neg	2

	Phrase	S2
0	A series of escapades demonstrating the adage ...	neg
1	A series of escapades demonstrating the adage ...	2
2	A series	2
3	A	2
4	series	2
...	...	...
156055	Hearst 's	2
156056	forced avuncular chortles	neg
156057	avuncular chortles	pos
156058	avuncular	2
156059	chortles	2

	text	PoN
0	A series of escapades demonstrating the adage ...	0
21	good for the goose	1
22	good	1
33	the gander , some of which occasionally amuses...	0
46	amuses	1
...	...	...
156047	quietly suggesting the sadness and obsession b...	0
156051	sadness and obsession	0
156052	sadness and	0
156056	forced avuncular chortles	0
156057	avuncular chortles	1

	Unnamed: 0	PhraseId	SentenceId	Phrase	Sentiment	t0	S2
0	0	1	1	A series of escapades demonstrating the adage ...	1	neg	neg
1	1	2	1	A series of escapades demonstrating the adage ...	2	neg	2
2	2	3	1	A series	2	neg	2
3	3	4	1	A	2	neg	2
4	4	5	1	series	2	neg	2
...	...	...	...	...	...	...	...
156055	156055	156056	8544	Hearst 's	2	neg	2
156056	156056	156057	8544	forced avuncular chortles	1	neg	neg
156057	156057	156058	8544	avuncular chortles	3	neg	pos
156058	156058	156059	8544	avuncular	2	neg	2
156059	156059	156060	8544	chortles	2	neg	2

	time	kind	data
0	1	y	4
1	2	y	9
2	3	y	2
3	1	z	1
4	2	z	2
5	3	z	3
6	1	k	11
7	2	k	12
8	3	k	13