In [146]:
## =======================================================
## IMPORTING
## =======================================================
import pandas as pd
train=pd.read_csv("../WK7/kaggle-sentiment/train.tsv", delimiter='\t')
y=train['Sentiment'].values
X=train['Phrase'].values
train.to_csv('kaggle_csv.csv')
In [94]:
# all_df.to_csv('kaggle_0_not0.csv', index=False)
t_sm = train[train.columns[2:5]]
train['t0'] = train['Sentiment'][train['Sentiment'] == 0] = 'neg'
train['S2'] = train['Sentiment']
train['S2'][train['S2'] == 0] = 'neg'
train['S2'][train['S2'] == 1] = 'neg'
train['S2'][train['S2'] == 3] = 'pos'
train['S2'][train['S2'] == 4] = 'pos'
train_sm = pd.DataFrame()
train_sm['Phrase'] = train['Phrase']
train_sm['S2'] = train['S2']
train_sm


train.to_csv('kaggle_csv.csv')
/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
Out[94]:
Phrase S2
0 A series of escapades demonstrating the adage ... neg
1 A series of escapades demonstrating the adage ... 2
2 A series 2
3 A 2
4 series 2
... ... ...
156055 Hearst 's 2
156056 forced avuncular chortles neg
156057 avuncular chortles pos
156058 avuncular 2
156059 chortles 2

156060 rows × 2 columns

In [35]:
len(train_sm)
Out[35]:
156060
In [36]:
len(train_sm[train_sm['S2'] == 'neg'])
Out[36]:
34345
In [37]:
len(train_sm[train_sm['S2'] == 'pos'])
Out[37]:
42133
In [38]:
len(train_sm[train_sm['S2'] == 2])
Out[38]:
79582
In [45]:
ar = [34345, 42133, 42133]
import matplotlib.pyplot as plt

plt.barplot()
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-45-1e325f98e22c> in <module>
      2 import matplotlib.pyplot as plt
      3 
----> 4 plt.barplot()

AttributeError: module 'matplotlib.pyplot' has no attribute 'barplot'
In [47]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


x = [1,2,3]

y = [4, 9, 2]
z = [1, 2, 3]
k = [11, 12, 13]

# number_of_bars
n_bars = 3

df = pd.DataFrame(zip(x*n_bars, ["y"]*len(x)+["z"]*len(x)+["k"]*len(x), y+z+k), columns=["time", "kind", "data"])
plt.figure(figsize=(10, 6))
sns.barplot(x="time", hue="kind", y="data", data=df)
plt.show()

df
Out[47]:
time kind data
0 1 y 4
1 2 y 9
2 3 y 2
3 1 z 1
4 2 z 2
5 3 z 3
6 1 k 11
7 2 k 12
8 3 k 13
In [62]:
ar = [34345, 42133, 42133]
import matplotlib.pyplot as plt

df = pd.DataFrame()
df['labels'] = ['n','p', '2']
df['nums'] = ar
df
sns.barplot(x='labels', y='nums',data = df)
Out[62]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1ec92b50>
In [68]:
train.to_csv('kaggle_csv.csv')

# sns.barplot(x='Sentiment', y=train['Sentiment'].count_(), data=train)
In [72]:
k = pd.read_csv('kaggle_csv.csv')
k
Out[72]:
Unnamed: 0 PhraseId SentenceId Phrase Sentiment t0 S2
0 0 1 1 A series of escapades demonstrating the adage ... 1 neg neg
1 1 2 1 A series of escapades demonstrating the adage ... 2 neg 2
2 2 3 1 A series 2 neg 2
3 3 4 1 A 2 neg 2
4 4 5 1 series 2 neg 2
... ... ... ... ... ... ... ...
156055 156055 156056 8544 Hearst 's 2 neg 2
156056 156056 156057 8544 forced avuncular chortles 1 neg neg
156057 156057 156058 8544 avuncular chortles 3 neg pos
156058 156058 156059 8544 avuncular 2 neg 2
156059 156059 156060 8544 chortles 2 neg 2

156060 rows × 7 columns

In [73]:
k2 = pd.DataFrame()
k2['Phrase'] = k['Phrase']
k2['S2'] = k['S2']
k2
Out[73]:
Phrase S2
0 A series of escapades demonstrating the adage ... neg
1 A series of escapades demonstrating the adage ... 2
2 A series 2
3 A 2
4 series 2
... ... ...
156055 Hearst 's 2
156056 forced avuncular chortles neg
156057 avuncular chortles pos
156058 avuncular 2
156059 chortles 2

156060 rows × 2 columns

In [79]:
k2 = k2.drop(k2[k2['S2'] == '2'].index)
In [88]:
k2['S2'][k2['S2'] == 'neg'] = 0
k2['S2'][k2['S2'] == 'pos'] = 1

# k2.to_csv('kaggle_pos1_neg0.csv')
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
~/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2896             try:
-> 2897                 return self._engine.get_loc(key)
   2898             except KeyError:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'S2'

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-88-30bd4b9e8fbe> in <module>
----> 1 k2['S2'][k2['S2'] == 'neg'] = 0
      2 k2['S2'][k2['S2'] == 'pos'] = 1
      3 k2.columns = [0,'PoN']
      4 k2
      5 # k2.to_csv('kaggle_pos1_neg0.csv')

~/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py in __getitem__(self, key)
   2978             if self.columns.nlevels > 1:
   2979                 return self._getitem_multilevel(key)
-> 2980             indexer = self.columns.get_loc(key)
   2981             if is_integer(indexer):
   2982                 indexer = [indexer]

~/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2897                 return self._engine.get_loc(key)
   2898             except KeyError:
-> 2899                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   2900         indexer = self.get_indexer([key], method=method, tolerance=tolerance)
   2901         if indexer.ndim > 1 or indexer.size > 1:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'S2'
In [91]:
k2.columns = ['text','PoN']
k2
Out[91]:
text PoN
0 A series of escapades demonstrating the adage ... 0
21 good for the goose 1
22 good 1
33 the gander , some of which occasionally amuses... 0
46 amuses 1
... ... ...
156047 quietly suggesting the sadness and obsession b... 0
156051 sadness and obsession 0
156052 sadness and 0
156056 forced avuncular chortles 0
156057 avuncular chortles 1

76478 rows × 2 columns

In [92]:
k2.to_csv('kaggle_pos1_neg0.csv', index=False)
In [95]:
k
Out[95]:
Unnamed: 0 PhraseId SentenceId Phrase Sentiment t0 S2
0 0 1 1 A series of escapades demonstrating the adage ... 1 neg neg
1 1 2 1 A series of escapades demonstrating the adage ... 2 neg 2
2 2 3 1 A series 2 neg 2
3 3 4 1 A 2 neg 2
4 4 5 1 series 2 neg 2
... ... ... ... ... ... ... ...
156055 156055 156056 8544 Hearst 's 2 neg 2
156056 156056 156057 8544 forced avuncular chortles 1 neg neg
156057 156057 156058 8544 avuncular chortles 3 neg pos
156058 156058 156059 8544 avuncular 2 neg 2
156059 156059 156060 8544 chortles 2 neg 2

156060 rows × 7 columns

In [101]:
k = pd.read_csv('kaggle_csv.csv')
k['S0'] = k['Sentiment']
k['S0'][k['S0'] == 0] = 0
k['S0'][k['S0'] == 1] = 1
k['S0'][k['S0'] == 2] = 1
k['S0'][k['S0'] == 3] = 1
k['S0'][k['S0'] == 4] = 1
k
/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
Out[101]:
Unnamed: 0 PhraseId SentenceId Phrase Sentiment t0 S2 S0
0 0 1 1 A series of escapades demonstrating the adage ... 1 neg neg 1
1 1 2 1 A series of escapades demonstrating the adage ... 2 neg 2 2
2 2 3 1 A series 2 neg 2 2
3 3 4 1 A 2 neg 2 2
4 4 5 1 series 2 neg 2 2
... ... ... ... ... ... ... ... ...
156055 156055 156056 8544 Hearst 's 2 neg 2 2
156056 156056 156057 8544 forced avuncular chortles 1 neg neg 1
156057 156057 156058 8544 avuncular chortles 3 neg pos 3
156058 156058 156059 8544 avuncular 2 neg 2 2
156059 156059 156060 8544 chortles 2 neg 2 2

156060 rows × 8 columns

In [105]:
k3 = pd.read_csv('kaggle_csv.csv')
k3
Out[105]:
Unnamed: 0 PhraseId SentenceId Phrase Sentiment t0 S2
0 0 1 1 A series of escapades demonstrating the adage ... 1 neg neg
1 1 2 1 A series of escapades demonstrating the adage ... 2 neg 2
2 2 3 1 A series 2 neg 2
3 3 4 1 A 2 neg 2
4 4 5 1 series 2 neg 2
... ... ... ... ... ... ... ...
156055 156055 156056 8544 Hearst 's 2 neg 2
156056 156056 156057 8544 forced avuncular chortles 1 neg neg
156057 156057 156058 8544 avuncular chortles 3 neg pos
156058 156058 156059 8544 avuncular 2 neg 2
156059 156059 156060 8544 chortles 2 neg 2

156060 rows × 7 columns

In [150]:
 
Out[150]:
156060
In [151]:
df[df['S0_0'] == 0]
Out[151]:
Phrase S0 S0_0
In [137]:
# (df['S0'] == 1).count()
(df['S0'] == 0).count()
Out[137]:
156060
In [138]:
len(df)
Out[138]:
156060
In [140]:
df['S0'].astype(int)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-140-d5c9ce8cbf4b> in <module>
----> 1 df['S0'].astype(int)

~/anaconda3/lib/python3.7/site-packages/pandas/core/generic.py in astype(self, dtype, copy, errors, **kwargs)
   5880             # else, only a single dtype is given
   5881             new_data = self._data.astype(
-> 5882                 dtype=dtype, copy=copy, errors=errors, **kwargs
   5883             )
   5884             return self._constructor(new_data).__finalize__(self)

~/anaconda3/lib/python3.7/site-packages/pandas/core/internals/managers.py in astype(self, dtype, **kwargs)
    579 
    580     def astype(self, dtype, **kwargs):
--> 581         return self.apply("astype", dtype=dtype, **kwargs)
    582 
    583     def convert(self, **kwargs):

~/anaconda3/lib/python3.7/site-packages/pandas/core/internals/managers.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)
    436                     kwargs[k] = obj.reindex(b_items, axis=axis, copy=align_copy)
    437 
--> 438             applied = getattr(b, f)(**kwargs)
    439             result_blocks = _extend_blocks(applied, result_blocks)
    440 

~/anaconda3/lib/python3.7/site-packages/pandas/core/internals/blocks.py in astype(self, dtype, copy, errors, values, **kwargs)
    557 
    558     def astype(self, dtype, copy=False, errors="raise", values=None, **kwargs):
--> 559         return self._astype(dtype, copy=copy, errors=errors, values=values, **kwargs)
    560 
    561     def _astype(self, dtype, copy=False, errors="raise", values=None, **kwargs):

~/anaconda3/lib/python3.7/site-packages/pandas/core/internals/blocks.py in _astype(self, dtype, copy, errors, values, **kwargs)
    641                     # _astype_nansafe works fine with 1-d only
    642                     vals1d = values.ravel()
--> 643                     values = astype_nansafe(vals1d, dtype, copy=True, **kwargs)
    644 
    645                 # TODO(extension)

~/anaconda3/lib/python3.7/site-packages/pandas/core/dtypes/cast.py in astype_nansafe(arr, dtype, copy, skipna)
    705         # work around NumPy brokenness, #1987
    706         if np.issubdtype(dtype.type, np.integer):
--> 707             return lib.astype_intsafe(arr.ravel(), dtype).reshape(arr.shape)
    708 
    709         # if we have a datetime/timedelta array of objects

pandas/_libs/lib.pyx in pandas._libs.lib.astype_intsafe()

ValueError: invalid literal for int() with base 10: 'neg'
In [147]:
k3 = pd.read_csv('kaggle_csv.csv')
In [149]:
k3['Sentiment'].values.min()
Out[149]:
0
In [155]:
df = pd.DataFrame()
df['Phrase'] = k3['Phrase']
df['S0'] = k3['Sentiment']

df['S0_0'] = [0 if x == 0 else 1 for x in df['S0']]
len(df[df['S0_0'] == 1])
Out[155]:
148988
In [156]:
len(df[df['S0_0'] == 0])
Out[156]:
7072
In [154]:
df
Out[154]:
Phrase S0 S0_0
0 A series of escapades demonstrating the adage ... 1 1
1 A series of escapades demonstrating the adage ... 2 1
2 A series 2 1
3 A 2 1
4 series 2 1
... ... ... ...
156055 Hearst 's 2 1
156056 forced avuncular chortles 1 1
156057 avuncular chortles 3 1
156058 avuncular 2 1
156059 chortles 2 1

156060 rows × 3 columns

In [ ]: