""" __author__ = 'jockqiao' __date__ = '2018/9/13 9:36' __email__ = 'LQ65535@163.com' """
import pandas as pd
import numpy as np
import random


def _rebalance_class(df, df_class, class_count, target):
    """Return ``df`` with one label class resampled to ``target`` rows.

    :param df: full data set containing the rows of ``df_class``.
    :param df_class: the rows of ``df`` that belong to the class being resized.
    :param class_count: current number of rows in ``df_class``.
    :param target: desired number of rows for that class.
    :return: a new DataFrame; ``df`` itself is not modified.
    """
    if target > class_count:
        # Over-sample: duplicate randomly chosen rows (with replacement, so
        # the class can grow past its original size).
        extra = df_class.sample(
            n=target - class_count, replace=True, random_state=0
        )
        return pd.concat([df, extra])
    if target < class_count:
        # Under-sample: pick distinct rows (no replacement) and drop them by
        # index label. Assumes the index labels of ``df`` are unique, as they
        # are for a freshly loaded frame.
        surplus = df_class.sample(
            n=class_count - target, replace=False, random_state=0
        )
        return df.drop(surplus.index)
    return df


def copy_data(df, df_0, df_1, row1, row2, rate):
    """Rebalance the two label classes of ``df`` and write it to ``test.csv``.

    The class with more rows is left untouched. The other class is over-sampled
    (rows duplicated) or under-sampled (rows dropped) so that the class sizes
    follow ``rate``. On a tie, class 1 is the one that gets adjusted.

    :param df: full data set.
    :param df_0: rows of ``df`` whose label is 0.
    :param df_1: rows of ``df`` whose label is 1.
    :param row1: number of rows in ``df_0``.
    :param row2: number of rows in ``df_1``.
    :param rate: final class ratio as ``[share of label 0, share of label 1]``.
    :return: the rebalanced DataFrame (the same data written to ``test.csv``).
    :raises ZeroDivisionError: if the rate of the larger class is zero.
    """
    if row1 >= row2:
        # Class 1 is the minority: scale it relative to the class-0 count.
        # int() truncates a fractional target, as the original formula did.
        target = int(row1 * rate[1] / rate[0])
        df = _rebalance_class(df, df_1, row2, target)
    else:
        # Class 0 is the minority: same computation with the roles swapped.
        target = int(row2 * rate[0] / rate[1])
        df = _rebalance_class(df, df_0, row1, target)

    df.to_csv('test.csv', index=False)
    return df


if __name__ == "__main__":
    df = pd.read_excel('behaviour.xlsx')
    name = list(df.columns)
    print(name)

    df_0 = df.loc[df['label'] == 0]
    df_1 = df.loc[df['label'] == 1]
    row1 = df_0.shape[0]
    row2 = df_1.shape[0]
    rate = [0.5, 0.5]

    print(type(df_0))
    copy_data(df, df_0, df_1, row1, row2, rate)
原创文章,转载请注明出处:http://124.221.219.47/article/smote/