Last active
September 4, 2024 12:48
-
-
Save robintux/e9e5b4b35b7454de190dd7227dfeff23 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import random
import warnings

import numpy as np
from sklearn.neighbors import NearestNeighbors
def smote(T, N, K):
    """Create synthetic minority-class samples with the SMOTE algorithm.

    Every row of ``T`` produces ``N // 100`` synthetic rows.  Each synthetic
    row lies between the original sample and one of its nearest neighbours,
    using an independent random interpolation factor per feature.

    Parameters
    ----------
    T : array-like of shape (nsamples, nfeatures)
        The minority-class matrix, one sample per row.
    N : int or float
        Percent oversampling, e.g. 500 yields 5 synthetic rows per sample.
        Must be at least 100; values that are not a multiple of 100 are
        truncated down to the nearest multiple.
    K : int
        Number of nearest neighbours (excluding the sample itself) to pick
        interpolation partners from.  Capped at ``nsamples - 1`` with a
        warning when ``T`` has too few rows.

    Returns
    -------
    numpy.ndarray of shape ((N // 100) * nsamples, nfeatures)
        Synthetic samples; rows ``i * (N // 100)`` up to
        ``(i + 1) * (N // 100) - 1`` are derived from ``T[i]``.

    Raises
    ------
    ValueError
        If ``T`` is not 2-D, ``N`` is below 100, ``K`` is below 1, or ``T``
        has fewer than two samples.

    Notes
    -----
    Random draws come from the standard-library ``random`` module, so
    ``random.seed`` controls reproducibility.
    """
    # Make sure T is a float matrix with samples in rows.
    T = np.asarray(T, dtype=np.float64)
    if T.ndim != 2:
        raise ValueError("T must be a 2-D array of shape (nsamples, nfeatures)")
    nsamples, nfeatures = T.shape
    if nsamples < nfeatures:
        warnings.warn("Make sure the features are in the columns.")

    # We only oversample, so each sample needs at least one synthetic row.
    if N < 100:
        raise ValueError("N should be at least 100")
    # Floor division keeps the count an integer on Python 3.
    per_sample = int(N) // 100

    K = int(K)
    if K < 1:
        raise ValueError("K should be at least 1")
    if nsamples < 2:
        raise ValueError("T needs at least 2 samples to interpolate between")

    # Ask for one extra neighbour: a point is normally its own nearest
    # neighbour, and it is discarded below so that K real neighbours remain.
    n_query = min(K + 1, nsamples)
    if n_query < K + 1:
        warnings.warn(
            "K is larger than the number of other samples; "
            "using %d neighbours instead." % (n_query - 1)
        )
    nn = NearestNeighbors(n_neighbors=n_query)
    nn.fit(T)
    # A single 2-D query for all rows replaces one 1-D query per sample.
    neighbor_idx = nn.kneighbors(T, return_distance=False)

    synthetic = np.zeros((per_sample * nsamples, nfeatures))
    for sample in range(nsamples):
        row = neighbor_idx[sample]
        # Drop the sample itself up front instead of rejection-sampling,
        # which could loop forever when it was the only neighbour returned.
        candidates = row[row != sample][:K]
        base = T[sample]
        for newindex in range(per_sample):
            neighbor = candidates[random.randrange(len(candidates))]
            # One independent gap per feature, as in the original routine.
            gaps = np.array([random.random() for _ in range(nfeatures)])
            synthetic[per_sample * sample + newindex] = (
                base + gaps * (T[neighbor] - base)
            )
    return synthetic
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment