Pandas

Pandas: An on-the-go “cheat sheet”

==================================

PRO TIP: do a ctrl f first

==================================

Pandas dictionary dict to df


import pandas as pd
df = pd.DataFrame(z3_sorted.items())
df

Merge multiple dictionaries

def merge_two_dicts(x, y):
    z = x.copy()   # start with x's keys and values
    z.update(y)    # modifies z with y's keys and values & returns None
    return z

z = merge_two_dicts(d1, d2)
z2 = merge_two_dicts(z, d3)

Sort dictionary python

sorted_dict = dict(sorted(unsorted_dict.items(), key=lambda item: item[1], reverse=True))

Return multiple things from a lambda

# make columns first
df['year'] = ""
df['month'] = ""
df['day'] = ""
def get_dates(filename):
    try:
        arr = filename.split('___')
        return [arr[0], arr[1], arr[2]]
    except:
        return [None, None, None]

df[['year','month','day']] = df.apply(lambda x: get_dates(x['filename']), axis=1, result_type="expand")

python - How to select rows from a DataFrame based on column values - Stack Overflow

python - AttributeError: Series object has no attribute value - Stack Overflow

python - Check if a given key already exists in a dictionary - Stack Overflow

python - Pythonic Way to reverse nested dictionaries - Stack Overflow

python - How to iterate over rows in a DataFrame in Pandas - Stack Overflow

SELECT ROWS WHOSE COLUMN VALUE IS OR IS IN

IN SQL

SELECT *
FROM table
WHERE colume_name = some_value

IN PYTHON

df.loc[df['column_name'] == some_value]

df.loc[df['column_name'].isin(some_values)]

df.loc[(df['column_name'] >= A) & (df['column_name'] <= B)]

SELECT ALL ROWS WHERE COLUMN CONTAINS

df[df['A'].str.contains("hello")]

PIVOT TABLES

# Year    Country          medal    no of medals
# 1896    Afghanistan      Gold        5
# 1896    Afghanistan      Silver      4
# 1896    Afghanistan      Bronze      3
# 1896    Algeria          Gold        1
# 1896    Algeria          Silver      2
# 1896    Algeria          Bronze      3

# Year    Country      Gold   Silver   Bronze
# 1896    Afghanistan    5      4         3
# 1896    Algeria        1      2         3

# medals = df.pivot_table('no of medals', ['Year', 'Country'], 'medal')

Pandas Plotting

stacked bars

To numeric

Use pd.to_numeric with errors=’coerce’

pd.to_numeric(df['column'], errors='coerce')
pd.to_numeric(df['column'], errors='coerce').fillna(0)

ADD A COLUMN FROM A GROUPBY

WHAT IS THIS MAGIC!? SO SPECIFICALLY THIS

bdata['group_MarketCap'] = bdata.groupby('yearmonth')['MarketCap'].transform('sum')

Select rows whos columns are

df.loc[df['column_name'].isin(some_values)]

Rename column

df=df.rename(columns = {'two':'new_name'})

add color to seaborn

palette

Creating multiple columns from an apply

so

appiled_df = df.apply(lambda row: get_times(row.time_usec), axis='columns', result_type='expand')
df = pd.concat([df, appiled_df], axis='columns')

MERGING

## WHAT THE DOCS SAID
df1.merge(df2, left_on='ds')

## WHAT STACKOVERFLOW SAID (and what worked)
df_merged = pd.merge(df1, df2, how='left', on='ds', suffixes=('_v1', '_v2'))

UNIQUE

Numpy unique

>>> a = numpy.array([0, 3, 0, 1, 0, 1, 2, 1, 0, 0, 0, 0, 1, 3, 4])
>>> unique, counts = numpy.unique(a, return_counts=True)
>>> dict(zip(unique, counts))
{0: 7, 1: 4, 2: 1, 3: 2, 4: 1}
Non-numpy way:

Use collections.Counter;

>> import collections, numpy

>>> a = numpy.array([0, 3, 0, 1, 0, 1, 2, 1, 0, 0, 0, 0, 1, 3, 4])
>>> collections.Counter(a)
Counter({0: 7, 1: 4, 3: 2, 2: 1, 4: 1})

Percentage missing

df_na = pd.DataFrame(df.isna().sum())
df_na['percent'] = (df_na[0] / df.shape[0]) *100
df_na.sort_values(by="percent", ascending = False)

last column to first column

python cols = list(df.columns) cols = [cols[-1]] + cols[:-1] df = df[cols] df

python df1 = df[(df.a != -1) & (df.b != -1)]

python deals = weights['! ;) :) half off free crazy deal only $ 80 %'.split()].round(3) * 100 ['! ;) :) half off free crazy deal only $ 80 %'.split()]

renaming columns

python data.rename(columns={'gdp':'log(gdp)'}, inplace=True)

merging

python new_df = pd.merge(A_df, B_df, how='left', left_on=['A_c1','c2'], right_on = ['B_c1','c2'])

one way to drop columns

python newdf = df[df.columns[2:4]]

another way

columns = ['b', 'c'] df1 = pd.DataFrame(df, columns=columns)

String Contains? And ignore NA

df.a.str.contains("foo", na=False)

Most efficient way to loop!

Most efficient way to loop pandas

Converting Column Content


df['color'] = ['red' if x == 'Z' else 'green' for x in df['Set']]
# w.female[w.female == 'female'] = 1
# w.female[w.female == 'male']   = 0

df['Status'][df['Status'] == 'ham'] = 1
df['Status'][df['Status'] == 'spam'] = 0

Replacing non-zeros with ones

df.astype(bool).astype(int)

Select columns of groupby by sum

df.groupby(['Country', 'Item_Code'])[["Y1961", "Y1962", "Y1963"]].sum()

SO

Problems importing CSV?

Dropping on a conditional

To remove all rows where column ‘score’ is < 50:

df = df.drop(df[df.score < 50].index)

SO

Had to change delimiter to

dirtyFile = pd.read_csv('dirtyfile.csv',  sep='\t')

see here

Resetting index

    all_df.reset_index(drop=True,inplace = True)

Conditionals

conditional_df = all_df[all_df['colname'] == 'val_i_need']

Subsetting specific columns (or dropping)

new = old.filter(['A','B','D'], axis=1)
new = old.drop('B', axis=1)

DF to array

source

df.values

Sum + Count of Column

df['column']=='yes').sum()
df['column']=='yes').count()

# using .apply
all_df['tokenized_count'] = all_df.apply(lambda x: len(x['tokenized']),axis=1)

Conditional creation of column

df['color'] = ['red' if x == 'Z' else 'green' for x in df['Set']]

import numpy as np
df_n['accurate'] = np.where(df_n['label'] == df_n['prediction'], 'yes', 'no')

stack overflow

Using lambdas (is this correct?)

def get_tokens(sentence):
    tokens = word_tokenize(sentence)
    clean_tokens = [word.lower() for word in tokens if word.isalpha()]
    return clean_tokens

all_df['tokenized'] = all_df.apply(lambda x: get_tokens(x[0]),axis=1)
all_df['tokenized_count'] = all_df.apply(lambda x: len(x['tokenized']),axis=1)

getting bag of words from column?

def get_bow_from_column(df, column):
    all_column_data = ' '.join(df[column].tolist())
    all_column_fd = Counter(all_column_data.split())
    return all_column_fd

# pos_bow = get_bow_from_column(all_df, 'pos_dict')

Tuples to DF

# data in the form of list of tuples
data = [('Peter', 18, 7),
        ('Riff', 15, 6),
        ('John', 17, 8),
        ('Michel', 18, 7),
        ('Sheli', 17, 5) ]


# create DataFrame using data
df = pd.DataFrame.from_records(data, columns =['Team', 'Age', 'Score'])

print(df)