""" __author__ = 'jockqiao' __date__ = '2018/9/13  9:36' __email__ = 'LQ65535@163.com' """ 
import pandas as pd 
import numpy as np 
import random
# Load the behaviour data set from the Excel workbook.
df = pd.read_excel('behaviour.xlsx')

# Column names as a plain list, echoed to stdout.
name = list(df.columns)
print(name)

# Split the rows by the value of the 'label' column.
is_label_0 = df['label'] == 0
is_label_1 = df['label'] == 1
df_0 = df[is_label_0]
df_1 = df[is_label_1]

# Row counts of the label-0 and label-1 groups respectively.
row1 = len(df_0)
row2 = len(df_1)
rate = [0.5, 0.5]


def copy_data(df, df_0, df_1, row1, row2, rate):
    """Rebalance the label-1 rows by random resampling and save the result.

    Only the case where label 1 is the smaller (or equal-sized) group is
    handled. When label 0 is the smaller group the data is written out
    unchanged.

    :param df: full data set containing both label groups
    :param df_0: rows of ``df`` with label 0 (only used for the type echo)
    :param df_1: rows of ``df`` with label 1; its index labels must be
        present in ``df`` so that surplus rows can be dropped
    :param row1: number of label-0 rows
    :param row2: number of label-1 rows
    :param rate: final ratio of the two groups; the first entry is for
        label 0, the second for label 1
    :return: the rebalanced DataFrame (also written to ``test.csv``)
    """
    # Label of the smaller group; a tie is treated as label 1 being smaller.
    minority = 1 if row1 >= row2 else 0

    if minority == 1:
        # Target number of label-1 rows: the label-0 count scaled by
        # rate[1] / rate[0]. The original arithmetic is kept as-is so the
        # truncated result is unchanged.
        target = int((row1 / (rate[0] * 10)) * (rate[1] * 10))

        if target > row2:
            # Too few label-1 rows: append random duplicates (sampling with
            # replacement, since more rows than exist may be needed).
            extra = df_1.sample(n=target - row2, replace=True, random_state=0)
            df = pd.concat([df, extra])
        elif target < row2:
            # Too many label-1 rows: drop a random subset. Sampling without
            # replacement guarantees exactly ``row2 - target`` distinct rows.
            surplus = df_1.sample(n=row2 - target, replace=False, random_state=0)
            df = df.drop(surplus.index)

    df.to_csv('test.csv', index=False)
    print(type(df_0))
    return df

# Run the resampling on the loaded data, using the ratio defined in `rate`.
copy_data(df, df_0, df_1, row1, row2, rate)

样本集说明:label 为 1 的样本为少数类(样本量较少)。


原创文章,转载请注明出处:http://124.221.219.47/article/smote/