以下代码示例演示如何按百分比阈值查找在指定子列上重复的行:
import pandas as pd
def find_duplicate_rows(df, columns, threshold):
    """Return rows whose share of duplicated subset columns meets a threshold.

    For every row, each column in ``columns`` is checked on its own: the
    column counts as "duplicated" for that row when the row's value occurs
    more than once in that column. The row's ``duplicate_percentage`` is the
    number of duplicated columns divided by the number of columns checked,
    i.e. a fraction between 0.0 and 1.0.

    Args:
        df: Input DataFrame. It is not modified.
        columns: Column label, or iterable of column labels, to inspect.
        threshold: Minimum fraction (0.0-1.0) of inspected columns that must
            hold a duplicated value for a row to be returned.

    Returns:
        A new DataFrame holding the qualifying rows of ``df`` plus a
        ``duplicate_percentage`` column.

    Raises:
        ValueError: If ``columns`` is empty.
        KeyError: If a requested column does not exist in ``df``.
    """
    # A bare string would otherwise be measured by its character count.
    if isinstance(columns, str):
        columns = [columns]
    columns = list(columns)
    if not columns:
        raise ValueError("columns must contain at least one column name")

    # keep=False flags every occurrence of a repeated value, not only the
    # later ones, so all members of a duplicate group are counted.
    duplicated_column_count = sum(
        df[column].duplicated(keep=False).astype(int) for column in columns
    )
    duplicate_percentage = duplicated_column_count / len(columns)

    # assign() builds a new frame, leaving the caller's DataFrame untouched.
    scored = df.assign(duplicate_percentage=duplicate_percentage)
    return scored[scored['duplicate_percentage'] >= threshold]
# Sample data: three columns in which every Name/Age pair repeats.
data = dict(
    Name=['John', 'John', 'Mary', 'Mary', 'John', 'John', 'Mary'],
    Age=[25, 25, 30, 30, 25, 25, 30],
    City=[
        'New York', 'New York', 'Los Angeles', 'Los Angeles',
        'New York', 'New York', 'Los Angeles',
    ],
)
df = pd.DataFrame(data)
# Look for duplicates on the Name/Age subset using a 0.6 threshold,
# then show the rows that were selected.
duplicates = find_duplicate_rows(df, ['Name', 'Age'], 0.6)
print(duplicates)
输出结果为:
Name Age City duplicate_percentage
0 John 25 New York 0.6
1 John 25 New York 0.6
4 John 25 New York 0.6
5 John 25 New York 0.6
上一篇:按百分比填充圆形边界