import pandas as pd

# Drop complete duplicates (all columns identical); the first occurrence is kept by default
df.drop_duplicates(inplace = True)
# Number of duplicate rows for the specified column combination
df.duplicated(['col1', 'col2']).sum()
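# duplicated() uses keep='first' by default, so the sum counts only the extra rows
# that repeat an earlier col1/col2 combination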
# Column names to check for partial duplicates
column_names = ['A','B','C']
duplicates = df.duplicated(subset = column_names, keep = False)
# See partial duplicate values
df[duplicates]
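# keep=False flags every occurrence of a duplicated combination, not just the repeats,
# so df[duplicates] shows all rows involved for side-by-side comparison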
# Combine partial duplicates by aggregating the remaining columns
summaries = {'D': 'max', 'E': 'mean'}
df = df.groupby(by = column_names).agg(summaries).reset_index()
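# A minimal sketch of the aggregation above on made-up data ('example' is a
# hypothetical frame, not part of the original): the two rows sharing A/B/C
# collapse into one row with the max of D and the mean of E
example = pd.DataFrame({'A': ['x', 'x', 'y'],
                        'B': [1, 1, 2],
                        'C': ['u', 'u', 'v'],
                        'D': [10, 20, 30],
                        'E': [1.0, 3.0, 5.0]})
example.groupby(by = ['A', 'B', 'C']).agg({'D': 'max', 'E': 'mean'}).reset_index()
# -> ('x', 1, 'u') becomes a single row with D = 20 and E = 2.0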
#################### Record linkage ##########################
##### Used for removing duplicates across two different DataFrames #####
import recordlinkage
# Create indexing object
indexer = recordlinkage.Index()
# Generate candidate pairs, blocking on a column common to both DataFrames
indexer.block('col')
pairs = indexer.index(df1, df2)
# See pairs
print(pairs)
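# pairs is a pandas MultiIndex of (df1 index, df2 index) candidate pairs,
# restricted to rows that share the same value in the blocking column 'col'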
# Create a Compare object
compare_cl = recordlinkage.Compare()
# Find exact matches for pairs of col1 and col2
compare_cl.exact('df1_col1', 'df2_col1', label='col1')
compare_cl.exact('df1_col2', 'df2_col2', label='col2')
# Find close matches for pairs of col3 and col4 using string similarity
compare_cl.string('df1_col3', 'df2_col3', threshold=0.85, label='col3')
compare_cl.string('df1_col4', 'df2_col4', threshold=0.85, label='col4')
# Compute the comparisons for all candidate pairs
potential_matches = compare_cl.compute(pairs, df1, df2)
# See potential matches
print(potential_matches)
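# potential_matches is a DataFrame indexed by the candidate pairs, with one column
# per label ('col1'..'col4'); 1 means that comparison matched, 0 means it did not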
# Filter pairs where at least 2 of the compared columns match
matches = potential_matches[potential_matches.sum(axis = 1) >= 2]
print(matches)
# See index
matches.index
# Get indices of df2 rows that matched a row in df1 (second level of the pair MultiIndex)
duplicate_rows = matches.index.get_level_values(1)
# Finding duplicates in df2
df2_duplicates = df2[df2.index.isin(duplicate_rows)]
# Finding rows in df2 that are not duplicates
df2_unique = df2[~df2.index.isin(duplicate_rows)]
# Link the DataFrames! (DataFrame.append is removed in pandas 2.x, so use pd.concat)
full_df = pd.concat([df1, df2_unique])
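# full_df now holds every row of df1 plus only the df2 rows that were not linked to df1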