import pandas as pd
from nlpia.data.loaders import get_data
pd.options.display.width = 120
sms = get_data('sms-spam')
sms
2019-12-01 09:48:50,057 WARNING:nlpia.constants:107: <module> Starting logger in nlpia.constants...
spam | text | |
---|---|---|
0 | 0 | Go until jurong point, crazy.. Available only ... |
1 | 0 | Ok lar... Joking wif u oni... |
2 | 1 | Free entry in 2 a wkly comp to win FA Cup fina... |
3 | 0 | U dun say so early hor... U c already then say... |
4 | 0 | Nah I don't think he goes to usf, he lives aro... |
... | ... | ... |
4832 | 1 | This is the 2nd time we have tried 2 contact u... |
4833 | 0 | Will ü b going to esplanade fr home? |
4834 | 0 | Pity, * was in mood for that. So...any other s... |
4835 | 0 | The guy did some bitching but I acted like i'd... |
4836 | 0 | Rofl. Its true to its name |
4837 rows × 2 columns
# Build row labels 'sms0', 'sms1', 'sms2!', ...: the message's position in the
# dataset, with a trailing '!' when it is labeled spam ('!' * 1) and nothing
# when it is ham ('!' * 0).
index = ['sms{}{}'.format(i, '!' * j) for i, j in enumerate(sms.spam)]
index
['sms0', 'sms1', 'sms2!', 'sms3', 'sms4', 'sms5!', 'sms6', 'sms7', 'sms8!', 'sms9!', 'sms10', 'sms11!', 'sms12!', 'sms13', 'sms14', 'sms15!', 'sms16', 'sms17', 'sms18', 'sms19!', 'sms20', 'sms21', 'sms22', 'sms23', 'sms24', 'sms25', 'sms26', 'sms27', 'sms28', 'sms29', 'sms30', 'sms31', 'sms32', 'sms33', 'sms34!', 'sms35', 'sms36', 'sms37', 'sms38', 'sms39', 'sms40', 'sms41', 'sms42!', 'sms43', 'sms44', 'sms45', 'sms46', 'sms47', 'sms48', 'sms49', 'sms50', 'sms51', 'sms52', 'sms53', 'sms54!', 'sms55', 'sms56!', 'sms57', 'sms58', 'sms59', 'sms60', 'sms61', 'sms62', 'sms63', 'sms64', 'sms65!', 'sms66', 'sms67!', 'sms68!', 'sms69', 'sms70', 'sms71', 'sms72', 'sms73', 'sms74', 'sms75', 'sms76', 'sms77', 'sms78', 'sms79', 'sms80', 'sms81', 'sms82', 'sms83', 'sms84', 'sms85', 'sms86', 'sms87', 'sms88', 'sms89', 'sms90', 'sms91', 'sms92', 'sms93!', 'sms94', 'sms95!', 'sms96', 'sms97', 'sms98', 'sms99', 'sms100', 'sms101', 'sms102', 'sms103', 'sms104', 'sms105', 'sms106', 'sms107', 'sms108', 'sms109', 'sms110', 'sms111', 'sms112', 'sms113', 'sms114!', 'sms115', 'sms116', 'sms117!', 'sms118', 'sms119', 'sms120!', 'sms121!', 'sms122', 'sms123!', 'sms124', 'sms125', 'sms126', 'sms127', 'sms128', 'sms129', 'sms130', 'sms131', 'sms132', 'sms133', 'sms134!', 'sms135!', 'sms136', 'sms137', 'sms138', 'sms139!', 'sms140', 'sms141', 'sms142', 'sms143', 'sms144', 'sms145', 'sms146', 'sms147!', 'sms148', 'sms149', 'sms150', 'sms151', 'sms152', 'sms153', 'sms154', 'sms155', 'sms156', 'sms157', 'sms158', 'sms159!', 'sms160!', 'sms161', 'sms162', 'sms163', 'sms164!', 'sms165!', 'sms166', 'sms167!', 'sms168', 'sms169', 'sms170', 'sms171', 'sms172', 'sms173', 'sms174', 'sms175', 'sms176', 'sms177', 'sms178', 'sms179', 'sms180', 'sms181', 'sms182', 'sms183', 'sms184', 'sms185', 'sms186', 'sms187', 'sms188!', 'sms189', 'sms190', 'sms191!', 'sms192', 'sms193', 'sms194', 'sms195', 'sms196', 'sms197', 'sms198', 'sms199', 'sms200', 'sms201', 'sms202', 'sms203', 'sms204', 'sms205', 'sms206', 
'sms207', 'sms208', 'sms209', 'sms210', 'sms211', 'sms212', 'sms213', 'sms214', 'sms215', 'sms216', 'sms217', 'sms218', 'sms219', 'sms220', 'sms221', 'sms222', 'sms223', 'sms224', 'sms225!', 'sms226', 'sms227!', 'sms228', 'sms229', 'sms230', 'sms231', 'sms232', 'sms233', 'sms234', 'sms235!', 'sms236', 'sms237', 'sms238', 'sms239', 'sms240!', 'sms241', 'sms242', 'sms243', 'sms244', 'sms245', 'sms246', 'sms247', 'sms248', 'sms249', 'sms250!', 'sms251', 'sms252', 'sms253', 'sms254', 'sms255', 'sms256', 'sms257', 'sms258', 'sms259!', 'sms260', 'sms261', 'sms262', 'sms263', 'sms264!', 'sms265', 'sms266', 'sms267', 'sms268!', 'sms269', 'sms270!', 'sms271', 'sms272', 'sms273!', 'sms274', 'sms275', 'sms276', 'sms277', 'sms278', 'sms279', 'sms280', 'sms281', 'sms282', 'sms283', 'sms284', 'sms285', 'sms286', 'sms287', 'sms288', 'sms289', 'sms290', 'sms291', 'sms292!', 'sms293', 'sms294', 'sms295', 'sms296', 'sms297', 'sms298', 'sms299', 'sms300!', 'sms301', 'sms302', 'sms303', 'sms304', 'sms305', 'sms306', 'sms307', 'sms308!', 'sms309', 'sms310', 'sms311!', 'sms312', 'sms313', 'sms314', 'sms315', 'sms316', 'sms317', 'sms318', 'sms319', 'sms320', 'sms321', 'sms322', 'sms323', 'sms324!', 'sms325', 'sms326', 'sms327', 'sms328', 'sms329!', 'sms330', 'sms331', 'sms332', 'sms333', 'sms334', 'sms335', 'sms336', 'sms337', 'sms338', 'sms339', 'sms340', 'sms341', 'sms342!', 'sms343', 'sms344', 'sms345', 'sms346', 'sms347', 'sms348', 'sms349', 'sms350', 'sms351', 'sms352!', 'sms353', 'sms354!', 'sms355!', 'sms356', 'sms357', 'sms358', 'sms359', 'sms360', 'sms361', 'sms362!', 'sms363', 'sms364!', 'sms365', 'sms366!', 'sms367', 'sms368!', 'sms369', 'sms370', 'sms371', 'sms372', 'sms373', 'sms374', 'sms375', 'sms376', 'sms377', 'sms378!', 'sms379', 'sms380', 'sms381', 'sms382', 'sms383', 'sms384', 'sms385', 'sms386', 'sms387', 'sms388', 'sms389', 'sms390', 'sms391', 'sms392', 'sms393', 'sms394', 'sms395', 'sms396', 'sms397', 'sms398', 'sms399', 'sms400', 'sms401!', 'sms402', 'sms403', 
'sms404', 'sms405', 'sms406', 'sms407', 'sms408', 'sms409', 'sms410', 'sms411', 'sms412', 'sms413!', 'sms414', 'sms415', 'sms416!', 'sms417', 'sms418', 'sms419', 'sms420!', 'sms421', 'sms422', 'sms423', 'sms424', 'sms425', 'sms426', 'sms427', 'sms428!', 'sms429!', 'sms430!', 'sms431', 'sms432', 'sms433', 'sms434', 'sms435!', 'sms436', 'sms437', 'sms438', 'sms439', 'sms440', 'sms441', 'sms442', 'sms443', 'sms444!', 'sms445', 'sms446', 'sms447', 'sms448!', 'sms449', 'sms450', 'sms451', 'sms452', 'sms453', 'sms454', 'sms455', 'sms456', 'sms457', 'sms458', 'sms459!', 'sms460', 'sms461', 'sms462', 'sms463!', 'sms464!', 'sms465', 'sms466', 'sms467', 'sms468', 'sms469', 'sms470', 'sms471', 'sms472!', 'sms473', 'sms474', 'sms475', 'sms476', 'sms477', 'sms478!', 'sms479!', 'sms480', 'sms481', 'sms482', 'sms483', 'sms484', 'sms485', 'sms486!', 'sms487', 'sms488!', 'sms489', 'sms490', 'sms491', 'sms492!', 'sms493', 'sms494!', 'sms495!', 'sms496', 'sms497', 'sms498', 'sms499!', 'sms500', 'sms501', 'sms502', 'sms503', 'sms504', 'sms505!', 'sms506', 'sms507', 'sms508', 'sms509', 'sms510', 'sms511!', 'sms512', 'sms513', 'sms514', 'sms515', 'sms516', 'sms517', 'sms518', 'sms519', 'sms520', 'sms521!', 'sms522', 'sms523!', 'sms524!', 'sms525!', 'sms526', 'sms527', 'sms528', 'sms529', 'sms530', 'sms531!', 'sms532', 'sms533!', 'sms534!', 'sms535!', 'sms536!', 'sms537', 'sms538', 'sms539', 'sms540', 'sms541', 'sms542', 'sms543', 'sms544', 'sms545', 'sms546', 'sms547', 'sms548', 'sms549', 'sms550', 'sms551', 'sms552', 'sms553', 'sms554', 'sms555!', 'sms556', 'sms557', 'sms558', 'sms559', 'sms560', 'sms561', 'sms562!', 'sms563', 'sms564', 'sms565!', 'sms566', 'sms567', 'sms568', 'sms569', 'sms570', 'sms571', 'sms572', 'sms573', 'sms574', 'sms575', 'sms576', 'sms577', 'sms578', 'sms579', 'sms580', 'sms581', 'sms582', 'sms583', 'sms584', 'sms585!', 'sms586', 'sms587', 'sms588', 'sms589', 'sms590!', 'sms591', 'sms592', 'sms593', 'sms594!', 'sms595', 'sms596', 'sms597', 'sms598!', 'sms599', 
'sms600', 'sms601', 'sms602!', 'sms603', 'sms604', 'sms605', 'sms606', 'sms607', 'sms608', 'sms609', 'sms610!', 'sms611', 'sms612', 'sms613', 'sms614', 'sms615', 'sms616', 'sms617!', 'sms618', 'sms619', 'sms620', 'sms621', 'sms622', 'sms623', 'sms624', 'sms625', 'sms626', 'sms627', 'sms628', 'sms629', 'sms630', 'sms631', 'sms632', 'sms633', 'sms634', 'sms635', 'sms636', 'sms637', 'sms638!', 'sms639', 'sms640', 'sms641', 'sms642', 'sms643', 'sms644', 'sms645', 'sms646', 'sms647', 'sms648', 'sms649', 'sms650', 'sms651', 'sms652', 'sms653', 'sms654', 'sms655', 'sms656', 'sms657!', 'sms658', 'sms659', 'sms660', 'sms661', 'sms662!', 'sms663', 'sms664', 'sms665', 'sms666', 'sms667', 'sms668', 'sms669', 'sms670', 'sms671', 'sms672!', 'sms673', 'sms674', 'sms675', 'sms676', 'sms677!', 'sms678', 'sms679', 'sms680', 'sms681', 'sms682', 'sms683', 'sms684', 'sms685', 'sms686', 'sms687', 'sms688', 'sms689', 'sms690', 'sms691', 'sms692', 'sms693', 'sms694', 'sms695', 'sms696', 'sms697', 'sms698', 'sms699', 'sms700!', 'sms701', 'sms702', 'sms703!', 'sms704', 'sms705!', 'sms706', 'sms707', 'sms708', 'sms709', 'sms710', 'sms711', 'sms712', 'sms713', 'sms714', 'sms715!', 'sms716', 'sms717', 'sms718', 'sms719!', 'sms720', 'sms721', 'sms722', 'sms723', 'sms724!', 'sms725', 'sms726', 'sms727!', 'sms728!', 'sms729', 'sms730', 'sms731', 'sms732', 'sms733', 'sms734', 'sms735', 'sms736', 'sms737', 'sms738', 'sms739', 'sms740', 'sms741', 'sms742', 'sms743', 'sms744!', 'sms745', 'sms746!', 'sms747', 'sms748', 'sms749', 'sms750', 'sms751', 'sms752!', 'sms753', 'sms754', 'sms755', 'sms756', 'sms757', 'sms758', 'sms759', 'sms760!', 'sms761', 'sms762', 'sms763', 'sms764', 'sms765', 'sms766', 'sms767', 'sms768', 'sms769', 'sms770', 'sms771', 'sms772', 'sms773!', 'sms774', 'sms775!', 'sms776', 'sms777!', 'sms778', 'sms779', 'sms780', 'sms781!', 'sms782', 'sms783', 'sms784!', 'sms785', 'sms786', 'sms787', 'sms788', 'sms789', 'sms790', 'sms791', 'sms792!', 'sms793', 'sms794', 'sms795', 'sms796', 
'sms797!', 'sms798', 'sms799', 'sms800', 'sms801!', 'sms802', 'sms803', 'sms804', 'sms805', 'sms806', 'sms807', 'sms808', 'sms809!', 'sms810', 'sms811', 'sms812', 'sms813', 'sms814', 'sms815', 'sms816', 'sms817', 'sms818!', 'sms819', 'sms820', 'sms821', 'sms822', 'sms823', 'sms824', 'sms825', 'sms826', 'sms827!', 'sms828', 'sms829', 'sms830', 'sms831', 'sms832', 'sms833', 'sms834', 'sms835', 'sms836', 'sms837', 'sms838', 'sms839', 'sms840', 'sms841', 'sms842', 'sms843', 'sms844', 'sms845', 'sms846', 'sms847', 'sms848', 'sms849!', 'sms850', 'sms851!', 'sms852', 'sms853', 'sms854', 'sms855', 'sms856', 'sms857', 'sms858', 'sms859', 'sms860!', 'sms861', 'sms862!', 'sms863', 'sms864', 'sms865', 'sms866', 'sms867', 'sms868', 'sms869', 'sms870', 'sms871', 'sms872!', 'sms873', 'sms874', 'sms875', 'sms876!', 'sms877', 'sms878', 'sms879', 'sms880!', 'sms881', 'sms882!', 'sms883', 'sms884!', 'sms885', 'sms886', 'sms887', 'sms888', 'sms889', 'sms890', 'sms891', 'sms892', 'sms893', 'sms894', 'sms895', 'sms896', 'sms897', 'sms898', 'sms899', 'sms900', 'sms901', 'sms902', 'sms903', 'sms904', 'sms905', 'sms906', 'sms907!', 'sms908', 'sms909', 'sms910', 'sms911', 'sms912', 'sms913', 'sms914', 'sms915', 'sms916', 'sms917', 'sms918!', 'sms919', 'sms920', 'sms921', 'sms922', 'sms923', 'sms924!', 'sms925', 'sms926', 'sms927', 'sms928', 'sms929!', 'sms930', 'sms931', 'sms932', 'sms933', 'sms934', 'sms935', 'sms936', 'sms937', 'sms938', 'sms939', 'sms940', 'sms941', 'sms942', 'sms943', 'sms944', 'sms945', 'sms946', 'sms947', 'sms948', 'sms949', 'sms950', 'sms951', 'sms952', 'sms953', 'sms954', 'sms955', 'sms956', 'sms957', 'sms958', 'sms959', 'sms960', 'sms961', 'sms962!', 'sms963!', 'sms964', 'sms965', 'sms966', 'sms967', 'sms968', 'sms969', 'sms970', 'sms971', 'sms972', 'sms973!', 'sms974', 'sms975', 'sms976', 'sms977', 'sms978', 'sms979', 'sms980', 'sms981', 'sms982', 'sms983!', 'sms984', 'sms985', 'sms986', 'sms987', 'sms988', 'sms989', 'sms990', 'sms991', 'sms992', 'sms993', 
'sms994', 'sms995', 'sms996', 'sms997', 'sms998', 'sms999', ...]
sms.spam
0 0 1 0 2 1 3 0 4 0 .. 4832 1 4833 0 4834 0 4835 0 4836 0 Name: spam, Length: 4837, dtype: int64
# Rebuild the frame with the same values and columns, but with the
# 'sms<N>[!]' labels as its row index.
sms = pd.DataFrame(sms.values, columns=sms.columns, index=index)
# Re-cast the label column to int (presumably sms.values yields an object
# array for the mixed int/text columns -- TODO confirm).
sms['spam'] = sms.spam.astype(int)
len(sms)
# Number of messages labeled spam (spam == 1).
sms.spam.sum()
sms.head(6)
spam | text | |
---|---|---|
sms0 | 0 | Go until jurong point, crazy.. Available only ... |
sms1 | 0 | Ok lar... Joking wif u oni... |
sms2! | 1 | Free entry in 2 a wkly comp to win FA Cup fina... |
sms3 | 0 | U dun say so early hor... U c already then say... |
sms4 | 0 | Nah I don't think he goes to usf, he lives aro... |
sms5! | 1 | FreeMsg Hey there darling it's been 3 week's n... |
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize.casual import casual_tokenize
# Vectorize the message text into TF-IDF features, tokenizing each message
# with NLTK's casual_tokenize.
tfidf_model = TfidfVectorizer(tokenizer = casual_tokenize)
# .toarray() turns the fit_transform result into a dense NumPy array
# (one row per message, one column per vocabulary term).
tfidf_docs = tfidf_model.fit_transform(raw_documents = sms.text).toarray()
print(tfidf_docs.shape)
# Number of messages labeled spam (spam == 1).
sms.spam.sum()
(4837, 9232)
638
# Boolean row mask: True where the message is labeled spam (spam == 1).
# Index with `mask` to select spam rows and with `~mask` to select ham rows.
mask = sms.spam.astype(bool).values
mask
array([False, False, True, ..., False, False, False])
# Spam centroid: the mean TF-IDF vector over the spam rows. axis=0 averages
# down each column independently, giving one value per vocabulary term.
spam_centroid = tfidf_docs[mask].mean(axis=0)
spam_centroid
array([0.06377591, 0.0041675 , 0.00056204, ..., 0. , 0. , 0. ])
# Ham centroid: the mean TF-IDF vector over the non-spam rows (~mask),
# again averaged per column.
ham_centroid = tfidf_docs[~mask].mean(axis=0)
ham_centroid
array([1.98493115e-02, 6.09435187e-03, 1.77747817e-04, ..., 6.31869803e-05, 6.31869803e-05, 6.31869803e-05])
# The vector from the ham centroid to the spam centroid is
# (spam_centroid - ham_centroid). Taking each message's dot product with it
# projects the message onto that direction, giving one raw score per message.
spamminess_score = tfidf_docs.dot(spam_centroid - ham_centroid)
# Rounded copy, for display only; the unrounded scores are used below.
ss = spamminess_score.round(2)
ss
array([-0.01, -0.02, 0.04, ..., -0.01, -0. , 0. ])
from sklearn.preprocessing import MinMaxScaler
# Rescale the raw scores with MinMaxScaler (default output range [0, 1]).
# reshape(-1,1) turns the 1-D score vector into the single-column 2-D array
# passed to the scaler.
sms['lda_score'] = MinMaxScaler().fit_transform(spamminess_score.reshape(-1,1))
# Predict spam (1) when the scaled score is above 0.5, otherwise ham (0).
sms['lda_predict'] = (sms.lda_score > .5).astype(int)
# Show true label, prediction and score side by side for the first six rows.
sms['spam lda_predict lda_score'.split()].round(2).head(6)
spam | lda_predict | lda_score | |
---|---|---|---|
sms0 | 0 | 0 | 0.23 |
sms1 | 0 | 0 | 0.18 |
sms2! | 1 | 1 | 0.72 |
sms3 | 0 | 0 | 0.18 |
sms4 | 0 | 0 | 0.29 |
sms5! | 1 | 1 | 0.55 |
from pugnlp.stats import Confusion
Confusion(sms['spam lda_predict'.split()])
/Users/danielcaraway/anaconda3/envs/nlpiaenv/lib/python3.6/site-packages/pugnlp/stats.py:504: UserWarning: Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access self.__setattr__('_hist_labels', self.sum().astype(int)) /Users/danielcaraway/anaconda3/envs/nlpiaenv/lib/python3.6/site-packages/pugnlp/stats.py:510: UserWarning: Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access setattr(self, '_hist_classes', self.T.sum())
lda_predict | 0 | 1 |
---|---|---|
spam | ||
0 | 4135 | 64 |
1 | 45 | 593 |
# Accuracy = 1 - error rate, where the error rate is the fraction of messages
# whose predicted label differs from the true label.
(1. - (sms.spam != sms.lda_predict).mean()).round(3)
0.977
## =======================================================
## IMPORTING
## =======================================================
import os
import pandas as pd
from nlpia.data.loaders import get_data
pd.options.display.width = 120
def get_data_from_files(path):
    """Read every entry of a directory and return the contents as strings.

    Args:
        path: Directory to read. A trailing path separator is optional.

    Returns:
        A list with one string per directory entry, in the order reported
        by ``os.listdir`` (which is arbitrary). Empty for an empty directory.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        OSError: If an entry cannot be opened as a file (e.g. a subdirectory).
    """
    results = []
    for name in os.listdir(path):
        # os.path.join works whether or not ``path`` ends with a separator;
        # plain string concatenation built a wrong file name without one.
        # The ``with`` block closes the handle even if read() raises.
        with open(os.path.join(path, name), encoding="ISO-8859-1") as f:
            results.append(f.read())
    return results
sms = get_data('sms-spam')
hw8 = get_data_from_files('110/110-f-d/')
--------------------------------------------------------------------------- FileNotFoundError Traceback (most recent call last) <ipython-input-15-0f1ff1b26e44> in <module> 1 sms = get_data('sms-spam') ----> 2 hw8 = get_data_from_files('110/110-f-d/') <ipython-input-14-aca3d0b6665d> in get_data_from_files(path) 8 9 def get_data_from_files(path): ---> 10 directory = os.listdir(path) 11 results = [] 12 for file in directory: FileNotFoundError: [Errno 2] No such file or directory: '110/110-f-d/'
from nlpia.book.examples.ch04_catdog_lsa_3x6x16\
import word_topic_vectors
word_topic_vectors.T.round(1)
word_topic_vectors
from nlpia.book.examples.ch04_catdog_lsa_sorted\
import lsa_models, prettify_tdm
bow_svd, tfidf_svd = lsa_models()
prettify_tdm(**bow_svd)
tdm = bow_svd['tdm']
tdm
# import numpy as np
# U, s, Vt = np.linalg.svd(tdm)
# import pandas as pd
# pd.DataFrame(U, index=tdm.index).round(2)
import numpy as np
U, s, Vt = np.linalg.svd(tdm)
import pandas as pd
pd.DataFrame(U, index = tdm.index).round(2)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-16-11810bd0e083> in <module> 5 6 import numpy as np ----> 7 U, s, Vt = np.linalg.svd(tdm) 8 import pandas as pd 9 pd.DataFrame(U, index = tdm.index).round(2) NameError: name 'tdm' is not defined
cd = get_data('cats_and_dogs_sorted')
100%|██████████| 263/263 [00:00<00:00, 44447.66it/s]
cd
array(['NYC is the Big Apple.', 'NYC is known as the Big Apple.', 'I love NYC!', 'I wore a hat to the Big Apple party in NYC.', 'Come to NYC. See the Big Apple!', 'Manhattan is called the Big Apple.', 'New York is a big city for a small cat.', 'The lion, a big cat, is the king of the jungle.', 'I love my pet cat.', 'I love New York City (NYC).', 'Your dog chased my cat.', 'Bright lights, big city?', "Simba, in Lion King, was inspired by Bambi who wasn't even a cat.", 'Does your dog have a dog house?', 'The cat steered clear of the dog house.', 'I love turtles.', 'Bengi was a small stray dog with a fluffy tan spotted coat.', 'The woman flew to NYC with her cat.', 'That dog is a big animal. He must eat a lot.', 'How big is New York? Is it a big city?', 'The dog ran through Central Park in NYC.', 'Where is NYC?', "The Cat's Meow", 'The dog sat on the floor.', 'The cat chased a mouse.', 'Dogs and cats love raw meat.', 'The cat never made eye contact.', 'The cat chased my laser pointer.', 'I pet the cat.', 'The cat died.', 'A dog chased the car, barking.', 'Mom and Mormor both love turtles.', 'The cat ate the bearded dragon.', 'A cat burglar stole my pets.', 'I chased your dog.', 'The lion tamer rode on top of the clown car into the ring carrying a chair.', 'NYC is a city that never sleeps.', 'I was in the dog house last night.', 'The cat in the window', 'He refused to sleep in the dog house.', 'Dogs love to smell the air rushing by in a car.', 'Your cat is cute.', 'A cat meowed on the hot tin roof', 'A dog chased my bike and barked loudly.', 'The dog ate my orchids.', 'The dog dropped the ball at my feet.', 'Dogs love to chase cars, trucks, and bikes.', 'So I went to NYC to be born again.', 'The man raised the lion from a cub and they still frolic in the jungle.', 'Wolves, dogs, and puppies love to play chase.', 'The dog rescued me from a hairy . 
', 'He makes a good guard dog.', 'The dog walked up and sniffed my leg.', 'The cat hated getting in the car.', "The cat licked it's fur.", 'The black cat crossed my path.', 'The post man likes our dog.', 'The dog likes a scratch behind his ear.', 'He pet the dog on the head.', 'Mom loves to walk through NYC.', 'Rascal was a tabby cat.', 'NYC is the only city where you can hardly find a typical American.', 'My cat has long hair.', 'She was an alley cat.', 'Can your dog do tricks?', 'Fido chased the cat up the alley.', 'The cat chased a speedy rat.', "Dogs don't have much room to run in a big city.", 'The woman took her dog on the plane.', 'Is NYC a city or a way of life?', 'My dog is a good boy most of the time.', 'The cat coughed up a hair ball.', 'The cat died at the vet.', 'The fireman rescued the cat in the tree.', "Cat, you ruined mom's dress!", 'The dog whined until I pet its head.', 'The dog chased the ball and caught it.', 'The dog jumped up on the bed.', 'I ran from the dog and jumped the bench.', 'Ashley had a wiener dog that she took boating up the river.', 'He took his life in his hands, j-walking in NYC.', 'A stray cat played with the injured frog.', 'The lion opened his mouth wide as she put her head into his mouth.', 'Bengi was a movie about the adventures of a lovable stray dog.', 'The tabby cat had a fluffy tail.', 'The cat likes a scratch under her chin.', 'The dog flew down the street after the bike.', 'The man had a cat in his carry-on.', 'The cat crossed the lane and then the sidewalk.', 'Moon got mauled in a fight with an alley cat.', 'A black cat crossed the sidewalk in front of me.', "The cat held the lizard's tail in its mouth.", 'The raccoons ate all the cat food in the garage.', 'The dog wash was just a hose and they hated it.', 'There are no lions in NYC, but there are lots of house cats.', 'The cat meowed and I pet it until it purred.', 'Rascal was an alley cat before she became a Lane pet.', 'A car struck the cat and we took it to the 
vet.', 'A cat pounced at the lizard but came away with only its tail.', "An old dog can learn new tricks if there's food involved.", 'Our Bengi was a mottled tan dog that loved to run around the yard.', "Animals don't drive cars, but my pet dog likes to stick his head out the window.", 'The Cat in the Hat is not about an animal or a hat.', 'Ursa was smart and deceptive.', 'Are there fish in your fountain?', 'Australian sheep dogs are smarter than the sheep.', 'Are there fish in the pond?', 'Are there turtles in your pond?', 'How many pets do you have at home?', 'Where do you keep a turtle in your house?', 'Carnivore cunning and cooperation makes them smarter than herbivores.', 'No lone wolf would dare attack a lone moose or adult caribou.', 'I loved frogs and the color green.', 'Wolf puppies play with crows.', 'Crows help wolves track down prey and wolves share the kill.', 'Toxoplasmosis will change your mind.', 'America is littered with Toxoplasmosis.', 'Billy never had any pets at his house.', 'Ursa would inch her way into the dining room sheepishly.', 'Walter and Moon taught me how to crack video games.', 'Mice get attracted to the smell of cats when the have Toxoplasmosis.', "No man is an island, unless he's a lone wolf like Walter Anderson.", 'Alligators and wolves compete for food on Horn Island.', 'He use duck tape to keep its mouth closed.', 'The ranger dragged the alligator over the seawall.', 'What about frogs in the pond?', 'Clinton helped Clayton catch the alligator.', 'The Inner Harbor had an alligator, some turtles and lots of fish.', 'An alligator ate several pets and ducks before Clayton caught him.', 'Can lizards swim under water?', 'Char drooled with Pavlovian delight at the hotdog in my hand.', "Rascal hated the car because it's associated with the vet.", 'She caught a frog with her paw.', 'She bit the frog with her teeth.', 'Berk, the vet, has ideas about sports games for people.', "You don't get a fever from Toxoplasmosis, you just get 
aggressive.", 'Bear loved to hang his head out the truck window.', 'Wild cats chase bikes and runners but not cars or trucks.', 'Ursa used to chase her tail when she was young.', 'Lizards, turtles, and alligators are kind-of green and slimy.', "The dogs licked my plate so I didn't have to wash it.", 'Dogs wag their tail when they are happy.', 'Ursa, a black lab, would beat her tail against the wall until it was raw.', 'Bear lapped water from the hose with his tongue.', 'Rascal lapped milk from her bowl, curling her tongue.', 'Bear was bloody and panting after mauling the goats and sheep.', 'Will cuddled with Moon and Zoe on the couch.', 'Char and Ursa played on the green grass in the back yard.', 'Rascal was a stray when we found her in a tree in the back yard.', 'A black kitten crossed the road dodging cars like Frogger.', 'Goats and sheep make great lawn mowers for a boat yard.', 'Bear lapped out of the truck window.', 'The Inner Harbor was our playground.', 'The dogs were not allowed on the couch or in the dining room.', 'Lane rescued Moon with Will power.', 'Men become more gullible once they get Toxoplasmosis.', 'Wolves eat deer and stay away from sheep if they smell humans.', 'Ants get a virus that makes crawl to the tip of a blade of grass.', 'Brian wanted to start an alligator farm.', 'Humans harbor infectious diseases from domestic pets.', 'Our neighbors raised baby alligators.', 'Early humans slept in the barn with domesticated animals.', 'Sheep and deer eat grass.', 'My brother had an aquarium.', 'Some lizards can grow a new tail.', 'Mormor loved turtles and had them all over her house.', 'The litter box smells.', 'The snapping turtle won the race.', 'Jupiter and Moon each had their own food bowl.', 'A turtle beat the rabbit in a race.', 'Cats and dogs sleeping together.', 'Women become more trusting once they get Toxoplasmosis.', "It's Berk a vet?", 'I froze as he sniffed.', 'I rode my bike home.', 'Mom loves to walk around Manhattan.', 'Give me such 
shows — give me the streets of Manhattan!', 'She loves dogs.', 'Is that a pet rat in your carry-on?', 'He put a hat on his head.', 'He put his hat in the overhead bin.', 'The car is in the garage.', 'Dogs like to chase cars.', 'The car had a bike rack.', 'Marc steered my bike into a parked car.', 'A cute kitten played with its mother.', 'Where do you live little guy?', 'Where did you come from?', 'You sure are cute.', 'Go lie down on your bed.', 'Be a good boy and go on home now.', "That's a good boy.", 'It rained cats and dogs.', 'She keeps a clean house.', 'She took the train into the city to see the ball drop.', 'The snake chased the rat.', 'Rascal taught me empathy and care.', "Snakes aren't usually considered pets.", "She doesn't like hats in the car but I do.", 'She rode her bike though central park wearing a hat.', "Animals, including pets, don't like riding in cars.", "Cats don't like riding into the city in a car.", 'The rat ran into a hole in the back.', 'The rat ate a hole in his hat.', 'How many dogs are in the city?', 'Where is Soho? In New York City?', 'Look at me! Look at me! Look at me NOW!', 'It is fun to have fun. But you have to know how.', 'Honey, it was ruined when she bought it.', '"He should not be here," said the fish in the pot. 
', '"He should not be here when your mother is not.” ', 'Speedy was too fast for Sylvester.', 'You are an animal.', "You're an animal!", 'Animals are not allowed on this flight.', 'Some flights allow animals in carry-ons.', "Snakes aren't usually allowed on planes.", 'The litter box is in the back of the house.', 'Do all dogs go to heaven?', 'Cats and dogs playing together.', 'Kittens are cute.', "Jupiter's hair stood on end.", 'The lizard aquarium was moist.', 'Turtles need water.', 'Cats hat water.', 'I chased the ferret with a water pistol.', 'The ferret got struck by lightening.', 'We have a car carrier.', 'Algernon lost his mind.', 'Flowers for Algernon is my favorite book.', 'The puppy played in the flower bed.', 'She brought me flowers.', 'I kept the compost full of worms and the flowers bloomed.', 'Her orchids and my Amaryllis bloomed the same day.', 'Char are the flowers.', 'Marc chased rascal with a squirt gun.', 'Rascal hid in the Cypress tree.', 'I cried at the end of Algernon.', 'Algernon taught me about animal consciousness, smarts.', 'Books taught me how to read people.', 'Moon leapt into my lap.', 'I want to be reborn as a Lane pet.', 'The giving tree gave out.', 'Do you have a pet?', "That's a cute kitten.", 'Old dogs can learn tricks.', 'Sit Ubu, sit.', 'Sit Char, sit.', 'Sit Bear, sit.', 'I flew a kite.', 'It rained cats and dogs.', 'Do dogs go to heaven?', 'What kind of pet do you have?', 'Ursa ran a squirrel up the tree.', 'The catbird seat', 'Lindstrom pets are spoiled.', "I painted Turtle's shell with nail polish.", 'I named my pet rock Rocky.', 'Are you a vet?', 'My flowers are blooming.', "A single flower grew in Benji's grave.", 'Char chased the squirrel.', 'I gnawed the frog legs with my teeth.'], dtype=object)
data = pd.DataFrame(cd)
data.to_csv('cats_and_dogs_sorted.csv')
# Reload the SMS spam dataset and vectorize the message text with TF-IDF.
sms = get_data('sms-spam')
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize.casual import casual_tokenize
tfidf = TfidfVectorizer(tokenizer=casual_tokenize)
# .toarray must be *called*: without the parentheses tfidf_docs is bound to
# the method object itself instead of the dense TF-IDF matrix.
tfidf_docs = tfidf.fit_transform(raw_documents=sms.text).toarray()
# Vocabulary size, i.e. the number of TF-IDF columns.
len(tfidf.vocabulary_)
9232
tfidf
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict', dtype=<class 'numpy.float64'>, encoding='utf-8', input='content', lowercase=True, max_df=1.0, max_features=None, min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True, stop_words=None, strip_accents=None, sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=<function casual_tokenize at 0x12c083b70>, use_idf=True, vocabulary=None)
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize.casual import casual_tokenize
# Rebuild the dense TF-IDF matrix for the SMS corpus (one row per message).
tfidf = TfidfVectorizer(tokenizer=casual_tokenize)
tfidf_docs = tfidf.fit_transform(raw_documents=sms.text).toarray()
len(tfidf.vocabulary_)
# Keep the un-shifted TF-IDF values in a DataFrame before the array is
# modified on the next line.
tfidf_docs_df = pd.DataFrame(tfidf_docs)
# .mean() with no axis is a single scalar over the whole matrix, so this
# subtracts one constant from every entry rather than centering each column.
# NOTE(review): if per-term centering was intended, this should be
# .mean(axis=0) -- confirm before changing, later cells consume tfidf_docs.
tfidf_docs = tfidf_docs - tfidf_docs.mean()
tfidf_docs.shape
# Number of messages labeled spam (spam == 1).
sms.spam.sum()
638
from sklearn.decomposition import PCA
# Fit a 16-component PCA on the TF-IDF matrix, then project every message
# onto those components; rows are labelled with the `index` names and
# columns with topic0..topicN.
pca = PCA(n_components=16).fit(tfidf_docs)
columns = list(map('topic{}'.format, range(pca.n_components)))
pca_topic_vectors = pd.DataFrame(
    pca.transform(tfidf_docs),
    index=index,
    columns=columns,
)
pca_topic_vectors.head(6).round(3)
topic0 | topic1 | topic2 | topic3 | topic4 | topic5 | topic6 | topic7 | topic8 | topic9 | topic10 | topic11 | topic12 | topic13 | topic14 | topic15 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
sms0 | 0.201 | 0.003 | 0.037 | 0.011 | -0.019 | -0.053 | 0.039 | -0.065 | 0.013 | -0.086 | 0.005 | -0.012 | 0.010 | -0.031 | 0.003 | 0.034 |
sms1 | 0.404 | -0.094 | -0.077 | 0.051 | 0.100 | 0.047 | 0.023 | 0.065 | 0.025 | -0.021 | -0.003 | -0.017 | 0.040 | -0.026 | 0.036 | -0.035 |
sms2! | -0.030 | -0.048 | 0.090 | -0.067 | 0.091 | -0.043 | -0.000 | -0.001 | -0.060 | 0.048 | 0.127 | -0.025 | 0.040 | -0.019 | -0.041 | 0.053 |
sms3 | 0.329 | -0.033 | -0.035 | -0.016 | 0.052 | 0.056 | -0.165 | -0.075 | 0.066 | -0.106 | 0.023 | -0.020 | 0.072 | -0.042 | 0.018 | -0.093 |
sms4 | 0.002 | 0.031 | 0.038 | 0.034 | -0.075 | -0.093 | -0.044 | 0.061 | -0.048 | 0.024 | 0.027 | -0.010 | 0.032 | 0.050 | -0.068 | -0.015 |
sms5! | -0.016 | 0.059 | 0.014 | -0.006 | 0.122 | -0.040 | 0.005 | 0.167 | -0.021 | 0.068 | 0.042 | -0.039 | -0.045 | 0.076 | 0.037 | -0.008 |
# pca.components_
# sms
tfidf.vocabulary_
{'go': 3807, 'until': 8487, 'jurong': 4675, 'point': 6296, ',': 13, 'crazy': 2549, '..': 21, 'available': 1531, 'only': 5910, 'in': 4396, 'bugis': 1973, 'n': 5594, 'great': 3894, 'world': 8977, 'la': 4811, 'e': 3056, 'buffet': 1971, '...': 25, 'cine': 2277, 'there': 8071, 'got': 3855, 'amore': 1296, 'wat': 8736, 'ok': 5874, 'lar': 4848, 'joking': 4642, 'wif': 8875, 'u': 8395, 'oni': 5906, 'free': 3604, 'entry': 3195, '2': 471, 'a': 1054, 'wkly': 8933, 'comp': 2386, 'to': 8192, 'win': 8890, 'fa': 3328, 'cup': 2608, 'final': 3450, 'tkts': 8180, '21st': 497, 'may': 5272, '2005': 487, '.': 15, 'text': 8020, '87121': 948, 'receive': 6688, 'question': 6574, '(': 9, 'std': 7651, 'txt': 8379, 'rate': 6628, ')': 10, 't': 7889, '&': 7, "c's": 2020, 'apply': 1383, '08452810075': 115, 'over': 6003, '18': 438, "'": 8, 's': 6959, 'dun': 3041, 'say': 7034, 'so': 7438, 'early': 3069, 'hor': 4207, 'c': 2019, 'already': 1268, 'then': 8065, 'nah': 5606, 'i': 4311, "don't": 2948, 'think': 8092, 'he': 4048, 'goes': 3819, 'usf': 8537, 'lives': 5004, 'around': 1435, 'here': 4104, 'though': 8111, 'freemsg': 3613, 'hey': 4116, 'darling': 2666, "it's": 4535, 'been': 1693, '3': 591, "week's": 8788, 'now': 5784, 'and': 1310, 'no': 5732, 'word': 8967, 'back': 1584, '!': 0, "i'd": 4312, 'like': 4954, 'some': 7454, 'fun': 3677, 'you': 9158, 'up': 8489, 'for': 3552, 'it': 4533, 'still': 7674, '?': 1037, 'tb': 7955, 'xxx': 9097, 'chgs': 2230, 'send': 7127, '£': 9216, '1.50': 344, 'rcv': 6641, 'even': 3240, 'my': 5584, 'brother': 1942, 'is': 4519, 'not': 5769, 'speak': 7529, 'with': 8918, 'me': 5281, 'they': 8083, 'treat': 8312, 'aids': 1214, 'patent': 6106, 'as': 1452, 'per': 6148, 'your': 9171, 'request': 6796, 'melle': 5315, 'oru': 5968, 'minnaminunginte': 5386, 'nurungu': 5807, 'vettam': 8599, 'has': 4022, 'set': 7154, 'callertune': 2047, 'all': 1253, 'callers': 2046, 'press': 6418, '*': 11, '9': 982, 'copy': 2489, 'friends': 3634, 'winner': 8900, 'valued': 8569, 'network': 5678, 'customer': 
2620, 'have': 4036, 'selected': 7113, 'receivea': 6689, '900': 986, 'prize': 6450, 'reward': 6851, 'claim': 2283, 'call': 2038, '09061701461': 263, 'code': 2344, 'kl341': 4771, 'valid': 8565, '12': 384, 'hours': 4226, 'had': 3965, 'mobile': 5441, '11': 371, 'months': 5484, 'or': 5946, 'more': 5489, 'r': 6590, 'entitled': 3192, 'update': 8495, 'the': 8052, 'latest': 4862, 'colour': 2364, 'mobiles': 5442, 'camera': 2058, 'co': 2333, 'on': 5897, '08002986030': 99, "i'm": 4314, 'gonna': 3834, 'be': 1669, 'home': 4176, 'soon': 7483, 'want': 8715, 'talk': 7921, 'about': 1076, 'this': 8100, 'stuff': 7741, 'anymore': 1350, 'tonight': 8235, 'k': 4683, "i've": 4316, 'cried': 2566, 'enough': 3182, 'today': 8199, 'six': 7341, 'chances': 2172, 'cash': 2116, 'from': 3652, '100': 354, '20,000': 482, 'pounds': 6357, '>': 1035, 'csh': 2584, '87575': 952, 'cost': 2501, '150p': 415, '/': 27, 'day': 2683, '6days': 827, '16': 431, '+': 12, 'tsandcs': 8344, 'reply': 6788, 'hl': 4148, '4': 659, 'info': 4433, 'urgent': 8513, 'won': 8950, '1': 337, 'week': 8787, 'membership': 5321, 'our': 5980, '100,000': 355, 'jackpot': 4564, ':': 1006, '81010': 900, 'www.dbuk.net': 9039, 'lccltd': 4880, 'pobox': 6286, '4403ldnw1a7rw18': 696, 'searching': 7081, 'right': 6863, 'words': 8968, 'thank': 8037, 'breather': 1912, 'promise': 6487, 'wont': 8958, 'take': 7913, 'help': 4089, 'granted': 3883, 'will': 8887, 'fulfil': 3673, 'wonderful': 8955, 'blessing': 1802, 'at': 1488, 'times': 8158, 'date': 2675, 'sunday': 7805, 'xxxmobilemovieclub': 9098, 'use': 8531, 'credit': 2556, 'click': 2306, 'wap': 8719, 'link': 4977, 'next': 5696, 'message': 5340, 'http://wap': 4259, 'xxxmobilemovieclub.com': 9099, '=': 1031, 'qjkgighjjgcbl': 6566, 'oh': 5869, 'watching': 8743, ':)': 1008, 'eh': 3116, 'remember': 6755, 'how': 4233, 'spell': 7545, 'his': 4139, 'name': 5612, 'yes': 9137, 'did': 2823, 'v': 8553, 'naughty': 5635, 'make': 5193, 'wet': 8828, 'fine': 3458, 'if': 4350, 'that': 8045, '\x92': 9211, 'way': 8753, 
'feel': 3400, 'its': 4546, 'gota': 3856, 'b': 1560, 'england': 3173, 'macedonia': 5156, '-': 14, 'dont': 2952, 'miss': 5402, 'goals': 3812, 'team': 7968, 'news': 5692, 'ur': 8510, 'national': 5629, '87077': 947, 'eg': 3109, 'try': 8340, 'wales': 8695, 'scotland': 7060, '4txt': 737, 'ú1': 9220, '20': 481, 'poboxox': 6287, '36504w45wq': 629, 'seriously': 7147, '‘': 9225, 'm': 5139, 'going': 3823, 'ha': 3961, 'ü': 9221, 'pay': 6119, 'first': 3476, 'when': 8840, 'da': 2639, 'stock': 7678, 'comin': 2376, 'aft': 1182, 'finish': 3462, 'lunch': 5121, 'str': 7702, 'down': 2974, 'lor': 5058, 'ard': 1410, 'smth': 7422, 'ffffffffff': 3420, 'alright': 1269, 'can': 2062, 'meet': 5303, 'sooner': 7485, 'just': 4677, 'forced': 3554, 'myself': 5591, 'eat': 3081, 'slice': 7372, 'really': 6670, 'hungry': 4287, 'tho': 8107, 'sucks': 7778, 'mark': 5230, 'getting': 3767, 'worried': 8981, 'knows': 4782, 'sick': 7289, 'turn': 8362, 'pizza': 6238, 'lol': 5035, 'always': 1279, 'convincing': 2476, 'catch': 2128, 'bus': 1993, 'are': 1411, 'frying': 3660, 'an': 1305, 'egg': 3111, 'tea': 7962, 'eating': 3084, "mom's": 5463, 'left': 4901, 'dinner': 2858, 'do': 2909, 'love': 5081, "we're": 8760, 'packing': 6032, 'car': 2085, "i'll": 4313, 'let': 4923, 'know': 4779, "there's": 8074, 'room': 6906, 'ahhh': 1209, 'work': 8970, 'vaguely': 8560, 'what': 8832, 'does': 2923, 'wait': 8689, "that's": 8048, 'clear': 2300, 'were': 8817, 'sure': 7832, 'being': 1713, 'sarcastic': 7010, 'why': 8868, 'x': 9076, "doesn't": 2926, 'live': 5000, 'us': 8525, 'yeah': 9125, 'was': 8728, 'apologetic': 1371, 'fallen': 3352, 'out': 5983, 'she': 7199, 'actin': 1123, 'spoilt': 7572, 'child': 2238, 'caught': 2132, 'till': 8152, 'but': 1999, 'we': 8757, "won't": 8951, 'doing': 2938, 'too': 8242, 'badly': 1589, 'cheers': 2214, 'tell': 7985, 'anything': 1356, 'fear': 3391, 'of': 5847, 'fainting': 3344, 'housework': 4231, 'quick': 6577, 'cuppa': 2610, 'thanks': 8038, 'subscription': 7766, 'ringtone': 6872, 'uk': 8415, 'charged': 
2184, '5': 745, 'month': 5479, 'please': 6265, 'confirm': 2431, 'by': 2016, 'replying': 6790, 'yup': 9192, 'look': 5046, 'timings': 8162, 'msg': 5528, 'again': 1190, 'xuhui': 9093, 'learn': 4892, '2nd': 567, 'her': 4099, 'lesson': 4921, '8am': 974, 'oops': 5921, "roommate's": 6909, 'done': 2950, 'see': 7098, 'letter': 4926, 'decide': 2711, 'hello': 4084, "how's": 4235, 'saturday': 7024, 'texting': 8027, "you'd": 9159, 'decided': 2712, 'tomo': 8224, 'trying': 8342, 'invite': 4493, 'pls': 6273, 'ahead': 1208, 'watts': 8751, 'wanted': 8716, 'weekend': 8791, 'abiola': 1072, 'forget': 3560, 'need': 5654, 'crave': 2546, 'most': 5499, 'sweet': 7863, 'arabian': 1407, 'steed': 7658, 'mmmmmm': 5431, 'yummy': 9187, '07732584351': 62, 'rodger': 6895, 'burns': 1990, 'tried': 8321, 're': 6645, 'sms': 7416, 'nokia': 5744, 'camcorder': 2056, '08000930705': 95, 'delivery': 2750, 'tomorrow': 8226, 'who': 8859, 'seeing': 7101, 'hope': 4198, 'man': 5203, 'well': 8807, 'endowed': 3163, 'am': 1281, '<#>': 1024, 'inches': 4401, 'calls': 2053, 'messages': 5344, 'missed': 5405, "didn't": 2828, 'get': 3760, 'hep': 4098, 'immunisation': 4379, 'nigeria': 5708, 'fair': 3345, 'hopefully': 4201, 'tyler': 8389, "can't": 2063, 'could': 2511, 'maybe': 5274, 'ask': 1463, 'bit': 1779, 'stubborn': 7730, 'hospital': 4214, 'kept': 4730, 'telling': 7986, 'weak': 8762, 'sucker': 7776, 'hospitals': 4215, 'suckers': 7777, 'thinked': 8093, 'time': 8154, 'saw': 7033, 'class': 2292, 'gram': 3875, 'usually': 8543, 'runs': 6949, 'half': 3977, 'eighth': 3119, 'smarter': 7395, 'gets': 3763, 'almost': 1264, 'whole': 8862, 'second': 7085, 'fyi': 3693, 'ride': 6862, 'morning': 5493, "he's": 4050, 'crashing': 2545, 'place': 6240, 'wow': 8997, 'never': 5683, 'realized': 6668, 'embarassed': 3144, 'accomodations': 1103, 'thought': 8112, 'liked': 4955, 'since': 7314, 'best': 1733, 'seemed': 7105, 'happy': 4011, '"': 1, 'cave': 2136, 'sorry': 7494, 'give': 3788, 'offered': 5855, 'embarassing': 3145, 'ac': 1089, 'sptv': 
7594, 'new': 5687, 'jersey': 4608, 'devils': 2803, 'detroit': 2797, 'red': 6711, 'wings': 8898, 'play': 6255, 'ice': 4330, 'hockey': 4161, 'correct': 2494, 'incorrect': 4412, 'end': 3158, 'mallika': 5202, 'sherawat': 7208, 'yesterday': 9141, 'find': 3455, '@': 1038, '<url>': 1030, 'congrats': 2437, 'year': 9126, 'special': 7531, 'cinema': 2278, 'pass': 6094, 'yours': 9176, '09061209465': 258, 'suprman': 7830, 'matrix': 5263, 'starwars': 7638, 'etc': 3230, 'bx420': 2014, 'ip4': 4502, '5we': 781, '150pm': 417, 'later': 4861, 'meeting': 5305, 'where': 8846, 'reached': 6652, 'gauti': 3728, 'sehwag': 7110, 'odi': 5846, 'series': 7145, 'pick': 6211, '$': 5, 'burger': 1985, 'yourself': 9177, 'move': 5513, 'pain': 6039, 'killing': 4754, 'good': 3836, 'joke': 4636, 'girls': 3785, 'situation': 7338, 'seekers': 7102, 'part': 6081, 'checking': 2208, 'iq': 4508, 'roommates': 6910, 'took': 8245, 'forever': 3557, 'come': 2371, 'double': 2966, 'check': 2204, 'hair': 3972, 'dresser': 2998, 'said': 6980, 'wun': 9024, 'cut': 2624, 'short': 7248, 'nice': 5701, 'pleased': 6266, 'advise': 1165, 'following': 3534, 'recent': 6692, 'review': 6849, 'mob': 5439, 'awarded': 1549, '1500': 414, 'bonus': 1844, '09066364589': 306, 'song': 7478, 'dedicated': 2722, 'which': 8853, 'dedicate': 2721, 'valuable': 8566, 'frnds': 3643, 'rply': 6925, 'complimentary': 2406, 'trip': 8322, 'eurodisinc': 3234, 'trav': 8304, 'aco': 1119, '41': 679, '1000': 356, 'dis': 2871, '6': 785, 'morefrmmob': 5490, 'shracomorsglsuplt': 7273, '10': 350, 'ls1': 5103, '3aj': 638, 'hear': 4062, 'divorce': 2900, 'barbie': 1620, 'comes': 2373, "ken's": 4728, 'plane': 6247, 'wah': 8682, 'lucky': 5114, 'save': 7029, 'money': 5470, 'hee': 4075, 'finished': 3464, 'hi': 4120, 'babe': 1574, 'im': 4368, 'wanna': 8713, 'something': 7464, 'xx': 9094, 'performed': 6155, 'waiting': 8692, 'machan': 5158, 'once': 5901, 'thats': 8051, 'cool': 2481, 'gentleman': 3751, 'dignity': 2848, 'respect': 6816, 'peoples': 6147, 'very': 8598, 'much': 
5544, 'shy': 7283, 'pa': 6027, 'operate': 5928, 'after': 1183, 'same': 6996, 'looking': 5050, 'job': 4623, "ta's": 7896, 'earn': 3070, 'ah': 1204, 'stop': 7688, 'urgnt': 8517, 'real': 6662, 'yo': 9152, 'tickets': 8142, 'one': 5903, 'jacket': 4563, 'used': 8532, 'multis': 5553, 'started': 7632, 'requests': 6797, 'came': 2057, 'bed': 1686, 'coins': 2350, 'factory': 3335, 'gotta': 3860, 'nitros': 5727, 'ela': 3124, 'kano': 4708, 'il': 4362, 'download': 2975, 'wen': 8811, 'don': 2947, 'stand': 7620, 'close': 2313, 'll': 5008, 'another': 1332, 'night': 5710, 'spent': 7550, 'late': 4858, 'afternoon': 1185, 'casualty': 2126, 'means': 5291, "haven't": 4039, 'any': 1346, 'y': 9107, '42moro': 689, 'includes': 4405, 'sheets': 7203, 'smile': 7403, 'pleasure': 6268, 'trouble': 8328, 'pours': 6359, 'rain': 6602, 'sum': 7798, 'hurts': 4297, 'becoz': 1684, 'someone': 7457, 'loves': 5090, 'smiling': 7407, 'service': 7150, 'representative': 6794, '0800 169 6031': 86, 'between': 1741, '10am': 365, '9pm': 1002, 'guaranteed': 3930, '5000': 755, 'havent': 4040, 'planning': 6251, 'buy': 2004, 'lido': 4937, '530': 767, 'show': 7264, 'collected': 2358, 'simply': 7311, 'password': 6102, 'mix': 5421, '85069': 934, 'verify': 8594, 'usher': 8538, 'britney': 1932, 'fml': 3525, 'po': 6284, 'box': 1879, '5249': 764, 'mk17': 5424, '92h': 990, '450ppw': 705, 'telugu': 7991, 'movie': 5516, 'abt': 1084, 'loads': 5014, 'loans': 5016, 'wk': 8928, 'hols': 4174, 'run': 6946, 'forgot': 3565, 'hairdressers': 3974, 'appointment': 1386, 'four': 3584, 'shower': 7266, 'beforehand': 1702, 'cause': 2133, 'prob': 6456, 'coffee': 2345, 'animation': 1319, 'nothing': 5774, 'else': 3138, 'okay': 5877, 'price': 6431, 'long': 5042, 'legal': 4904, 'them': 8061, 'ave': 1536, 'ams': 1301, 'gone': 3832, '4the': 735, 'driving': 3007, 'test': 8014, 'yet': 9142, "you're": 9162, 'mean': 5287, 'guess': 3936, 'gave': 3729, 'boston': 1866, 'men': 5326, 'changed': 2174, 'search': 7080, 'location': 5019, 'nyc': 5819, 'cuz': 2631, 
'signin': 7299, 'page': 6035, 'says': 7038, 'umma': 8423, 'life': 4940, 'vava': 8580, 'lot': 5066, 'dear': 2699, 'wishes': 8912, 'birthday': 1777, 'making': 5197, 'truly': 8335, 'memorable': 5323, 'aight': 1216, 'hit': 4141, 'would': 8993, 'ip': 4501, 'address': 1141, 'considering': 2449, 'computer': 2412, "isn't": 4528, 'minecraft': 5380, 'server': 7149, 'grumpy': 3923, 'old': 5889, 'people': 6146, 'mom': 5462, 'better': 1738, 'lying': 5135, 'jokes': 4640, 'worry': 8983, 'busy': 1998, 'plural': 6277, 'noun': 5781, 'research': 6802, 'dinner.msg': 2859, 'cos': 2499, 'things': 8091, 'scared': 7044, 'mah': 5180, 'loud': 5076, 'gent': 3749, 'contact': 2454, 'last': 4855, 'weekends': 8793, 'draw': 2989, 'shows': 7272, '09064012160': 282, 'k52': 4691, '12hrs': 398, '150ppm': 419, 'wa': 8677, 'openin': 5925, 'sentence': 7138, 'formal': 3569, 'anyway': 1360, 'juz': 4682, 'tt': 8348, 'eatin': 3083, 'puttin': 6554, 'weight': 8798, 'haha': 3968, 'anythin': 1355, 'happened': 4003, 'entered': 3185, 'cabin': 2025, "b'day": 1562, 'boss': 1865, 'felt': 3411, 'askd': 1464, 'invited': 4494, 'apartment': 1365, 'went': 8814, 'specially': 7536, 'holiday': 4171, 'flights': 3502, 'inc': 4399, 'operator': 5929, '08712778109': 166, '10p': 368, 'min': 5372, 'goodo': 3846, 'must': 5575, 'friday': 3626, 'egg-potato': 3112, 'ratio': 6631, 'tortilla': 8262, 'needed': 5656, 'hmm': 4153, 'uncle': 8433, 'informed': 4438, 'paying': 6124, 'school': 7050, 'directly': 2865, 'food': 3542, 'private': 6447, '2004': 486, 'account': 1107, 'statement': 7641, '07742676969': 64, '786': 864, 'unredeemed': 8477, 'points': 6297, '08719180248': 213, 'identifier': 4344, '45239': 707, 'expires': 3307, '2000': 484, 'caller': 2045, '5/9': 752, '03': 46, 'landline': 4835, '09064019788': 288, '42wr29c': 690, 'apples': 1381, 'pairs': 6044, 'malarky': 5199, 'todays': 8205, 'voda': 8645, 'numbers': 5804, 'ending': 3160, '7548': 856, '350': 624, 'award': 1548, 'match': 5251, '08712300220': 149, 'quoting': 6589, '4041': 
674, 'standard': 7621, 'rates': 6629, 'app': 1375, 'sao': 7004, 'mu': 5542, 'predict': 6392, "ü'll": 9222, 'buying': 2007, 'yetunde': 9144, "hasn't": 4024, 'sent': 7137, 'bother': 1869, 'sending': 7129, 'involve': 4498, "shouldn't": 7259, 'imposed': 4386, 'apologise': 1372, 'girl': 3782, 'del': 2740, 'bak': 1597, 'lucyxx': 5118, 'tmorrow.pls': 8185, 'accomodate': 1102, 'answer': 1335, 'sunshine': 7812, 'quiz': 6584, 'q': 6559, 'top': 8253, 'sony': 7480, 'dvd': 3051, 'player': 6257, 'country': 2518, 'algarve': 1245, 'ansr': 1334, '82277': 907, 'sp': 7516, 'tyrone': 8394, 'laid': 4827, 'dogging': 2932, 'locations': 5020, 'direct': 2864, 'join': 4631, "uk's": 8416, 'largest': 4852, 'bt': 1958, 'txting': 8383, 'gravel': 3888, '69888': 822, 'nt': 5791, 'ec2a': 3086, '31p': 611, '@150p': 1039, 'haf': 3967, 'msn': 5534, 'yijue@hotmail.com': 9149, 'him': 4132, 'rooms': 6911, 'befor': 1699, 'activities': 1129, "you'll": 9161, 'msgs': 5533, 'chat': 2195, ...}
# Sort the vocabulary by column number so that terms[i] is the token stored
# in column i of the TF-IDF matrix.
column_nums, terms = zip(*sorted(
    (col, term) for term, col in tfidf.vocabulary_.items()
))
terms[:5]
('!', '"', '#', '#150', '#5000')
column_nums[:5]
(0, 1, 2, 3, 4)
# One row per PCA component ("topic"), one column per vocabulary term.
# The row labels are derived from the fitted components instead of a
# hard-coded 16, so they stay correct if the PCA size is changed.
weights = pd.DataFrame(
    pca.components_,
    columns=terms,
    index=['topic{}'.format(i) for i in range(len(pca.components_))],
)
# Limit how many columns pandas prints for the 9000+ term frame.
pd.options.display.max_columns = 12
weights.head(4).round(3)
! | " | # | #150 | #5000 | $ | ... | ’ | “ | … | ┾ | 〨ud | 鈥 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
topic0 | -0.071 | 0.008 | -0.001 | -0.000 | -0.001 | 0.003 | ... | -0.001 | -0.001 | -0.002 | 0.001 | 0.001 | 0.001 |
topic1 | 0.064 | 0.008 | 0.000 | -0.000 | -0.001 | -0.001 | ... | -0.001 | -0.001 | 0.003 | 0.001 | 0.001 | 0.001 |
topic2 | 0.071 | 0.027 | 0.000 | 0.001 | 0.002 | 0.000 | ... | 0.000 | 0.001 | 0.002 | -0.001 | -0.001 | -0.001 |
topic3 | -0.059 | -0.032 | -0.001 | -0.000 | -0.001 | 0.001 | ... | -0.000 | -0.000 | 0.001 | 0.001 | 0.001 | 0.001 |
4 rows × 9232 columns
# Per-topic weights for a hand-picked set of tokens, rounded to three
# decimals and scaled by 100 for readability.
deals = weights.loc[:, '! ;) :) half off free crazy deal only $ 80 %'.split()].round(3) * 100
deals
! | ;) | :) | half | off | free | crazy | deal | only | $ | 80 | % | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
topic0 | -7.1 | 0.1 | -0.5 | -0.0 | -0.4 | -2.0 | -0.0 | -0.1 | -2.2 | 0.3 | -0.0 | -0.0 |
topic1 | 6.4 | 0.0 | 7.4 | 0.1 | 0.4 | -2.3 | -0.2 | -0.1 | -3.8 | -0.1 | -0.0 | -0.2 |
topic2 | 7.1 | 0.2 | -0.1 | 0.0 | 0.3 | 4.4 | 0.1 | -0.1 | 0.7 | 0.0 | 0.0 | 0.1 |
topic3 | -5.9 | -0.3 | -7.1 | 0.2 | 0.3 | -0.2 | 0.0 | 0.1 | -2.3 | 0.1 | -0.1 | -0.3 |
topic4 | 38.1 | -0.1 | -12.5 | -0.1 | -0.2 | 9.9 | 0.1 | -0.2 | 3.0 | 0.3 | 0.1 | -0.1 |
topic5 | -26.5 | 0.1 | -1.6 | -0.3 | -0.7 | -1.4 | -0.6 | -0.2 | -1.8 | -0.9 | 0.0 | 0.0 |
topic6 | -10.9 | -0.5 | 19.9 | -0.4 | -0.9 | -0.6 | -0.2 | -0.1 | -1.4 | -0.0 | -0.0 | -0.1 |
topic7 | 15.9 | 0.1 | -18.3 | 0.8 | 0.8 | -2.9 | 0.0 | 0.1 | -1.8 | -0.3 | 0.0 | -0.1 |
topic8 | 35.0 | 0.2 | 5.9 | -0.5 | -0.5 | 0.3 | -0.4 | -0.4 | 3.0 | -0.6 | -0.0 | -0.2 |
topic9 | 9.2 | -0.3 | 18.7 | 1.4 | -0.8 | 6.8 | -0.5 | -0.4 | 3.1 | -0.5 | -0.0 | -0.0 |
topic10 | -31.9 | -0.2 | -7.6 | 0.1 | 0.2 | 12.9 | 0.1 | -0.0 | 0.2 | -0.0 | -0.0 | -0.2 |
topic11 | -23.0 | -0.4 | -15.8 | -0.5 | -1.2 | 8.1 | 0.0 | -0.2 | 0.2 | 0.5 | 0.0 | 0.3 |
topic12 | -22.9 | -0.2 | 37.1 | -0.1 | 0.2 | -4.9 | -0.6 | 0.2 | 3.4 | 0.2 | -0.0 | 0.3 |
topic13 | 16.5 | -0.2 | 27.3 | -0.3 | 0.8 | 3.7 | 0.5 | 0.2 | -2.8 | -0.4 | -0.0 | -0.2 |
topic14 | 12.3 | -0.3 | 21.3 | -0.5 | -1.2 | -0.9 | -0.0 | 0.2 | 4.1 | -0.4 | 0.1 | -0.4 |
topic15 | -15.9 | -0.3 | 1.9 | 1.1 | -1.0 | 5.1 | -0.4 | 0.5 | -0.2 | -0.3 | 0.0 | -0.2 |
# Rebind `deals` to the per-topic weights of the single token 'dog'.
deals = weights[['dog']].round(3)
deals
dog | |
---|---|
topic0 | 0.000 |
topic1 | 0.002 |
topic2 | -0.001 |
topic3 | 0.002 |
topic4 | -0.002 |
topic5 | -0.000 |
topic6 | -0.001 |
topic7 | 0.000 |
topic8 | -0.000 |
topic9 | -0.002 |
topic10 | -0.002 |
topic11 | 0.008 |
topic12 | 0.001 |
topic13 | -0.002 |
topic14 | -0.001 |
topic15 | 0.005 |
import os
def get_data_from_files(path):
    """Read every entry of the directory ``path`` and return the contents.

    Args:
        path: Directory to read. A trailing path separator is optional.

    Returns:
        A list with one string per directory entry, in the order
        ``os.listdir`` reports them.

    Raises:
        OSError: If ``path`` cannot be listed or an entry cannot be opened
            for reading.
    """
    results = []
    for name in os.listdir(path):
        # os.path.join adds the separator only when it is missing, so the
        # caller no longer has to remember the trailing slash; the with-block
        # closes the handle even if read() raises.
        with open(os.path.join(path, name)) as f:
            results.append(f.read())
    return results
# Read the text files of the photo-text corpus into a one-column DataFrame.
# `all_df` and `df` are two names for the same object, not copies.
data = get_data_from_files('../Documents/IST_736_TextMining/AmazonPhotoTextCorpus/')
all_df = df = pd.DataFrame(data)
import re
def clean_rogue_characters_2(string: str) -> str:
    """Replace each run of unwanted characters in ``string`` with one space.

    Only ASCII letters, digits and '.' are kept; any maximal run of other
    characters (punctuation, whitespace, newlines, symbols) becomes a single
    space.  Leading and trailing spaces produced this way are not stripped.

    Args:
        string: Text to clean.

    Returns:
        The cleaned text.
    """
    return re.sub(r'[^0-9a-zA-Z.]+', ' ', string)
# Add a 'clean' column holding the sanitized version of the raw text in column 0.
all_df['clean'] = all_df[0].map(clean_rogue_characters_2)
all_df
0 | clean | |
---|---|---|
0 | *)\niS\n\n11:23 1]\n\nQ Search\n\nIf | shine a... | iS 11 23 1 Q Search If shine a white LED ligh... |
1 | 6:51 .\n\n*)\n0\n\n® forums.nexusmods.com\n\ne... | 6 51 . 0 forums.nexusmods.com ee BrettM fosaym... |
2 | 6:52 al Se)\n\n® forums.nexusmods.com\n\n17 Ap... | 6 52 al Se forums.nexusmods.com 17 Apr 2012 St... |
3 | 4:06 aw Fe\n\n@ spokesman.com (4\n\nILULL. LL ... | 4 06 aw Fe spokesman.com 4 ILULL. LL 1S SUUULI... |
4 | Parents do not own their children.\nNo one own... | Parents do not own their children. No one owns... |
5 | o | changed shampoos, cut out dairy,\nlitres o... | o changed shampoos cut out dairy litres of wat... |
6 | WEIS °2P)40p" o2ua8- yp <i\n\n \n\ndeyiaao SpJ... | WEIS 2P 40p o2ua8 yp i deyiaao SpJ0M Jo 3eq su... |
7 | casispie:\nhugealienpie:\nthechubbynerd:\njust... | casispie hugealienpie thechubbynerd just showe... |
8 | 527k J @® 173k it, Share Oo\n\nelfmere * 16h\n... | 527k J 173k it Share Oo elfmere 16h Why do you... |
9 | 6:55 at > =)\n@ google.com (h\n= Google ©\nstr... | 6 55 at google.com h Google strange women lyin... |
10 | 1:51 =\n< Mail\n@ glassdoor.com\n\nas\nGitHub\... | 1 51 Mail glassdoor.com as GitHub Policy Detai... |
11 | ™ ‘/fantasywriters\nu/SlinkySlang * 3h\n\nWrit... | fantasywriters u SlinkySlang 3h Writing advic... |
12 | 10:21 wil @\n\n< Mail\n\n@ jobs.capitalgroup.... | 10 21 wil Mail jobs.capitalgroup.com Responsib... |
13 | 177k ¥ @ 3.9k it, Share Oo\n\nGalacticPingvin ... | 177k 3.9k it Share Oo GalacticPingvin 3h lemla... |
14 | a. Ricky Montgomery\n\n| am upset with my pare... | a. Ricky Montgomery am upset with my parents f... |
15 | ig nyx5\n\ni prefer guys who make small dick j... | ig nyx5 i prefer guys who make small dick joke... |
16 | 427% @ 27 it, Share Oo\n\nmrtrouble22 % + 4h »... | 427 27 it Share Oo mrtrouble22 4h Believer i r... |
17 | 6:52 al Se)\n\n@ forums.nexusmods.com (4\n\nas... | 6 52 al Se forums.nexusmods.com 4 as Soon as y... |
18 | 11:54 a eG)\n\nsciencedaily.com\n\n- —_—_—_\n\... | 11 54 a eG sciencedaily.com The medical commun... |
19 | @ lore-54352452524-deactivated201\n\nyou can r... | lore 54352452524 deactivated201 you can reple... |
20 | 4128 @ 8 it, Share We)\n\nDatadevourer * 3h\n\... | 4128 8 it Share We Datadevourer 3h Recently st... |
21 | @ 53.2k § @ 176k it, Share o\n\nPyroSnakel41 «... | 53.2k 176k it Share o PyroSnakel41 13h Become... |
22 | r/nosurf\nu/fibonacciseries * 9d\n\nTurning on... | r nosurf u fibonacciseries 9d Turning on the G... |
23 | * Vote J ™ 10 it, Share Oo\n\nit2051229 * 36m\... | Vote J 10 it Share Oo it2051229 36m In the be... |
24 | 4:50 >\nQ Search\nNews Home Popular\n\n20k J @... | 4 50 Q Search News Home Popular 20k J 15 it Sh... |
25 | e205 ™ 16 it, Share o\n\niseemath ° 18h\n\n| t... | e205 16 it Share o iseemath 18h think you ll f... |
26 | @ 124k J @ 49k it, Share Oo\nGo © 234\n\n15 MO... | 124k J 49k it Share Oo Go 234 15 MORE REPLIES... |
27 | @ 124k J @ 49k it, Share Oo\n\n10 MORE REPLIES... | 124k J 49k it Share Oo 10 MORE REPLIES sudden... |
28 | 2:07 al > a\n\n2 Messages\n\n€ Back Front? Bac... | 2 07 al a 2 Messages Back Front Back NV I m no... |
29 | 527k J @® 173k it, Share Oo\nQ ® Reply # 56k H... | 527k J 173k it Share Oo Q Reply 56k H insertca... |
30 | @ 124k J @ 49k it, Share Oo\n\n2 MORE REPLIES\... | 124k J 49k it Share Oo 2 MORE REPLIES 1 MORE ... |
31 | f r/stopdrinking\nu/creaturefeaturel6 * 10h * ... | f r stopdrinking u creaturefeaturel6 10h 689 d... |
32 | 6:51 al Se)\n\n@ forums.nexusmods.com (4\n\n \... | 6 51 al Se forums.nexusmods.com 4 ToniPrufrock... |
33 | 12:40 1 Fe)\n\nice;comftort ae]\niY wsyusually... | 12 40 1 Fe ice comftort ae iY wsyusuallyjread ... |
34 | 8:00 wl LTE @ )\n@ google.com (hy\n\nIt was Oc... | 8 00 wl LTE google.com hy It was October 28 19... |
35 | 86k | @ 3.5k it, Share Oo\n\nMrBOOMbabdtlc « 7... | 86k 3.5k it Share Oo MrBOOMbabdtlc 7h flinty d... |
36 | 21k J ® 2.1k it, Share Oo\n\nWw DOO! VYUNINIEN... | 21k J 2.1k it Share Oo Ww DOO VYUNINIENISO WwW... |
37 | 4 147k J @ 314 it, Share Oo\n\nmikevago ¢ 4h\n... | 4 147k J 314 it Share Oo mikevago 4h One of my... |
38 | 631 4 ™ 59 it, Share Oo\n\nPublicFigurex % » 2... | 631 4 59 it Share Oo PublicFigurex 28d It s fr... |
39 | r/EatCheapAndHealthy\nu/chickentender1995 ° 12... | r EatCheapAndHealthy u chickentender1995 12h B... |
40 | 423 M6 it, Share Oo\n\nspecific or more detail... | 423 M6 it Share Oo specific or more detailed w... |
41 | 9:01 li LTE@ )\n<4 Clock\n\nQ Search (ED SJ o\... | 9 01 li LTE 4 Clock Q Search ED SJ o Atwater V... |
42 | What is the most a dollar has ever gotten\nyou... | What is the most a dollar has ever gotten you ... |
43 | 2:48 at\n\n<O @\n\nAli Ho >\n\n*)\n8\n\nToday ... | 2 48 at O Ali Ho 8 Today 10 40 AM Recommendati... |
44 | 27k @ 6.2k it, Share Oo\n\nEmpurpledprose « 18... | 27k 6.2k it Share Oo Empurpledprose 18h wrote ... |
45 | 6:52 wi Fe\n4\n\n@ forums.nexusmods.com\n\nbes... | 6 52 wi Fe 4 forums.nexusmods.com best arrows ... |
46 | 4128 @ 8 it, Share We)\n\nUMNKINg ADOUL It ana... | 4128 8 it Share We UMNKINg ADOUL It ana e naQ ... |
47 | @ 124k J @ 49k it, Share Oo\n\n5 MORE REPLIES\... | 124k J 49k it Share Oo 5 MORE REPLIES HonchoM... |
48 | Being a pet owner is like being a\nsugar daddy... | Being a pet owner is like being a sugar daddy.... |
49 | Dana Schwartz @\n@DanaSchwartzzz\n\nBELLE: The... | Dana Schwartz DanaSchwartzzz BELLE There goes ... |
50 | 7:42 all > @)\n€ All inboxes “N\n\nParker from... | 7 42 all All inboxes N Parker from Interview C... |
51 | stupid bumps are that painful, pus filled kind... | stupid bumps are that painful pus filled kinda... |
52 | @ spokesman.com (4\n\nA. Jock itch is normally... | spokesman.com 4 A. Jock itch is normally caus... |
53 | 10:21\n< Mail\n\nef > &)\n\n@ jobs.capitalgrou... | 10 21 Mail ef jobs.capitalgroup.com Date Oct 1... |
54 | 47g Mo it, Share Oo\n\nVULTESIUPIILVIAVII * CI... | 47g Mo it Share Oo VULTESIUPIILVIAVII CII Step... |
55 | a couple of scenes later when Mr. Robot is\nwe... | a couple of scenes later when Mr. Robot is wea... |
56 | 27k @ 6.2k it, Share Oo\n\nTell Rachel | said ... | 27k 6.2k it Share Oo Tell Rachel said Hi. o 9 ... |
57 | 10:01 WF\n4\n\n@ google.com\n\n= Google ©\ndea... | 10 01 WF 4 google.com Google deacon fallout 4 ... |