Last active
April 9, 2019 12:18
-
-
Save psorianom/1996422a692f111883450cd36f62e72b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''Generates a synthetic dataset (csv) of persons to test the SNU_assignator
Usage: | |
SNU_gen.py <o> [options] | |
Arguments: | |
<o> An output path to store the synthetic data csv
-n PER Number of persons to generate [default: 2000:int] | |
-f FIL Representation proportion of the filiere. Ex: "0.1,0.1,...,0.1" (default: None) | |
-r RES Representation proportion of the residence Ex: "0.1,0.1,...,0.1" (default: None) | |
-s SEX Representation proportion of the sexes Ex: "0.3,0.7" (default: None)
''' | |
import logging | |
from math import isclose | |
import pandas as pd | |
import numpy as np | |
from argopt import argopt | |
# The script logs through the root logger, with its threshold set to INFO.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Categories available for every generated column. The position of each
# category matters: a proportion vector given on the command line is matched
# to these lists index by index.
# NOTE(review): "Haute-Pyrénées" is presumably the department
# "Hautes-Pyrénées"; kept as-is because consumers may rely on this exact
# value -- confirm before changing.
FIXED_VALUES = {
    "filiere": [
        "LGT",
        "LP",
        "MILO",
        "CFA",
        "EMS",
        "PJJ",
        "SHN",
        "Actif",
    ],
    "residence": [
        "Ardennes",
        "Puy-de-Dôme",
        "Cher",
        "Haute-Saône",
        "Morbihan",
        "Eure",
        "Haute-Pyrénées",
        "Loire-Atlantique",
        "Vaucluse",
        "Guyane",
        "Nord",
        "Creuse",
        "Val d'Oise",
    ],
    "sexe": ["F", "M"],
}
def generate_syn_data(output_path, n_persons, proportions):
    '''Sample a synthetic population and write it to a csv file.

    For each of the columns "filiere", "residence" and "sexe", n_persons
    values are drawn from the matching FIXED_VALUES list with
    np.random.choice, weighted by proportions[column].

    Args:
        output_path: Destination handed to DataFrame.to_csv.
        n_persons: Number of rows to generate.
        proportions: Mapping from column name to the probability vector
            passed as `p` to np.random.choice.
    '''
    column_names = ("filiere", "residence", "sexe")
    # Columns are sampled in this fixed order, one np.random.choice call each.
    sampled = {
        name: np.random.choice(FIXED_VALUES[name], n_persons, p=proportions[name])
        for name in column_names
    }
    frame = pd.DataFrame(sampled, index=np.arange(n_persons))
    frame.to_csv(output_path, encoding="utf8")
def parse_proportions(raw, n_categories):
    '''Parse a comma-separated list of proportions into a probability vector.

    Args:
        raw: String of comma-separated numbers, e.g. "0.3,0.7".
        n_categories: Number of entries the resulting vector must have.

    Returns:
        A 1-D numpy float array with n_categories entries.

    Raises:
        ValueError: If an entry is not a number, the number of entries is not
            n_categories, an entry is negative, or the entries do not sum
            to 1.
    '''
    # float() raises ValueError itself on non-numeric or empty tokens.
    values = np.array([float(token) for token in raw.split(",")])
    if len(values) != n_categories:
        raise ValueError(
            "expected {0} values but got {1}".format(n_categories, len(values)))
    # np.random.choice rejects negative probabilities even when they sum to 1.
    if (values < 0).any():
        raise ValueError("proportions must not be negative")
    if not isclose(values.sum(), 1.):
        raise ValueError(
            "not a valid prob distribution (sum is {0}, != 1.0)".format(values.sum()))
    return values


if __name__ == '__main__':
    parser = argopt(__doc__).parse_args()
    output_path = parser.o
    n_persons = parser.n

    requested = {
        "filiere": parser.f,
        "residence": parser.r,
        "sexe": parser.s,
    }

    proportions = {}
    for column, raw in requested.items():
        n_categories = len(FIXED_VALUES[column])
        # Default: every category is equally likely.
        chosen = np.ones(n_categories) / n_categories
        if raw:
            try:
                chosen = parse_proportions(raw, n_categories)
            except ValueError as err:
                # Single format string with lazy arguments; passing several
                # separate message strings makes the logging module fail
                # while formatting the record.
                logger.error(
                    "%s proportion was indicated but is not usable (%s). "
                    "Using the same proportion for each one.",
                    column, err)
        proportions[column] = chosen

    generate_syn_data(output_path=output_path, n_persons=n_persons, proportions=proportions)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment