Skip to content

Instantly share code, notes, and snippets.

@cwidmer
Created December 5, 2013 21:08
Show Gist options
  • Save cwidmer/7813931 to your computer and use it in GitHub Desktop.
Save cwidmer/7813931 to your computer and use it in GitHub Desktop.
Python module to create shogun objects for dealing with string kernels and string data within the COFFIN framework
#!/usr/bin/env python2.5
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# Written (W) 2010-2013 Christian Widmer
# Copyright (C) 2010-2013 Max-Planck-Society, TU-Berlin, MSKCC
"""
module to create shogun data objects according to given parameters
"""
from shogun.Classifier import SVMLight, LibLinear, L2R_LR
from shogun.Kernel import WeightedDegreeStringKernel, LinearKernel, PolyKernel, GaussianKernel, CTaxonomy
from shogun.Features import StringCharFeatures, RealFeatures, CombinedFeatures, StringWordFeatures, SortWordString
from shogun.Features import DNA, PROTEIN, BinaryLabels
from shogun.Kernel import WeightedDegreeStringKernel, CombinedKernel, WeightedCommWordStringKernel, WeightedDegreePositionStringKernel, WeightedDegreeRBFKernel
from shogun.Features import StringCharFeatures, DNA, StringWordFeatures, CombinedFeatures
from shogun.Features import CombinedDotFeatures, HashedWDFeatures, HashedWDFeaturesTransposed, WDFeatures, ImplicitWeightedSpecFeatures, StringByteFeatures
import numpy
def create_labels(labels):
    """
    wrap raw label values in a shogun BinaryLabels object

    @param labels: label values; they are converted with numpy.double
                   before being handed to BinaryLabels
    @return: the BinaryLabels instance built from the converted values
    """
    label_values = numpy.double(labels)
    return BinaryLabels(label_values)
########################################################
# string-kernel based stuff
########################################################
def get_spectrum_features(data, order=3, gap=0, reverse=True):
    """
    build the sorted word features used by the spectrum kernel

    @param data: sequences handed to StringCharFeatures with the DNA alphabet
    @param order: word order; obtain_from_char is called with (order-1, order)
    @param gap: gap argument forwarded to obtain_from_char
    @param reverse: reverse argument forwarded to obtain_from_char
    @return: StringWordFeatures with a SortWordString preprocessor applied
    """
    # raw character features over the DNA alphabet
    chars = StringCharFeatures(data, DNA)

    # word features sharing the alphabet of the character features
    words = StringWordFeatures(chars.get_alphabet())
    words.obtain_from_char(chars, order - 1, order, gap, reverse)

    # attach the sorting preprocessor and run it on the word features
    sorter = SortWordString()
    sorter.init(words)
    words.add_preprocessor(sorter)
    words.apply_preprocessor()

    return words
def get_wd_features(data, feat_type="dna"):
    """
    build the string features consumed by the weighted degree kernel

    @param data: sequences passed to set_features
    @param feat_type: "dna" or "protein"; selects the alphabet
    @return: StringCharFeatures over the chosen alphabet, filled with data
    @raise Exception: if feat_type is neither "dna" nor "protein"
    """
    # pick the alphabet first so the feature object is built in one place
    if feat_type == "dna":
        alphabet = DNA
    elif feat_type == "protein":
        alphabet = PROTEIN
    else:
        raise Exception("unknown feature type")

    wd_feat = StringCharFeatures(alphabet)
    wd_feat.set_features(data)

    return wd_feat
def create_empty_promoter_kernel(degree_wdk, kernel_cache=1000):
    """
    creates an uninitialized promoter kernel

    The result is a CombinedKernel holding, in this order, a
    WeightedDegreeStringKernel (center) and two
    WeightedCommWordStringKernel objects (left and right border).

    @param degree_wdk: degree passed to WeightedDegreeStringKernel
    @param kernel_cache: total cache budget; each of the three sub-kernels
                         receives one third of it (rounded down)
    @return: CombinedKernel that has not been initialized with features yet
    """
    # floor division keeps the per-kernel share integral even when "/" means
    # true division (python 3 or "from __future__ import division")
    cache_share = kernel_cache // 3

    # TODO: enable shifts for the center part
    #       (WeightedDegreePositionStringKernel + set_shifts)

    # centered WDK
    kernel_center = WeightedDegreeStringKernel(degree_wdk)
    kernel_center.set_cache_size(cache_share)

    # border spectrum kernels
    use_sign = False
    kernel_left = WeightedCommWordStringKernel(cache_share, use_sign)
    kernel_right = WeightedCommWordStringKernel(cache_share, use_sign)

    # assemble combined kernel; center/left/right is the same order in which
    # create_promoter_features appends its feature objects
    kernel = CombinedKernel()
    kernel.append_kernel(kernel_center)
    kernel.append_kernel(kernel_left)
    kernel.append_kernel(kernel_right)

    return kernel
def create_promoter_kernel(examples, center_offset, center_pos, degree_wdk, degree_spectrum, kernel_cache=1000):
    """
    creates a promoter kernel initialized on the given examples

    @param examples: sequences the promoter features are built from
    @param center_offset: offset to both sides of center_pos that forms the
                          center window
    @param center_pos: position the sequences are split at
    @param degree_wdk: degree of the WeightedDegreeStringKernel (center part)
    @param degree_spectrum: word order of the spectrum features (border parts)
    @param kernel_cache: cache budget forwarded to create_empty_promoter_kernel
    @return: the combined kernel after kernel.init(feat, feat)
    """
    # create uninitialized kernel
    kernel = create_empty_promoter_kernel(degree_wdk, kernel_cache)

    # get features; degree_spectrum must be forwarded explicitly, otherwise
    # the border features are always built with the default order
    feat = create_promoter_features(examples, center_offset, center_pos, degree_spectrum)

    # init combined kernel (same features on both sides)
    kernel.init(feat, feat)

    return kernel


def create_promoter_features(data, center_offset, center_pos, degree_spectrum=3):
    """
    creates promoter combined features

    @param data: indexable collection of sequences
    @param center_offset: offset to both sides of center_pos that forms the
                          center window
    @param center_pos: position the sequences are split at
    @param degree_spectrum: word order used for the left/right spectrum
                            features (defaults to 3, the default order of
                            get_spectrum_features)
    @return: CombinedFeatures holding center, left and right features,
             appended in that order
    @raise AssertionError: if the sanity checks on the sequence lengths fail
    """
    print("creating promoter features")

    (center, left, right) = split_data_promoter(data, center_offset, center_pos)

    # sanity check sequences: every entry must be as long as the first one
    assert len(center) == len(left) == len(right)

    parts = (("data", data), ("center", center), ("left", left), ("right", right))
    for i in range(1, len(center)):
        for (label, seqs) in parts:
            assert len(seqs[i]) == len(seqs[0]), "%s length mismatch %i: %i!=%i" % (label, i, len(seqs[0]), len(seqs[i]))

    # set up base features
    feat_center = StringCharFeatures(DNA)
    feat_center.set_features(center)

    feat_left = get_spectrum_features(left, order=degree_spectrum)
    feat_right = get_spectrum_features(right, order=degree_spectrum)

    # construct combined features
    feat = CombinedFeatures()
    feat.append_feature_obj(feat_center)
    feat.append_feature_obj(feat_left)
    feat.append_feature_obj(feat_right)

    return feat
def split_data_promoter(data, center_offset, center_pos):
    '''
    split promoter data in three parts

    @param data: iterable of sliceable sequences
    @param center_offset: offset to both sides of center_pos that forms the
                          center window
    @param center_pos: split position (negative values count from the end,
                       as in regular slicing)
    @return: tuple (center, left, right) of lists with one entry per sequence:
             center is the window [center_pos - center_offset,
             center_pos + center_offset), left is seq[:center_pos] and
             right is seq[center_pos:]
    '''
    start = center_pos - center_offset
    stop = center_pos + center_offset

    # A window reaching past a sequence boundary must be cut at that boundary.
    # Without this, a start that turns negative (or a stop that turns
    # non-negative for a negative center_pos) is re-interpreted by the slice
    # as an index relative to the other end and yields an empty/wrong window.
    if center_pos >= 0 and start < 0:
        start = 0
    elif center_pos < 0 and stop >= 0:
        stop = None

    center = [seq[start:stop] for seq in data]
    left = [seq[:center_pos] for seq in data]
    right = [seq[center_pos:] for seq in data]

    return (center, left, right)
########################################################
# linear stuff
########################################################
def create_hashed_promoter_features(data, center_offset, center_pos, degree_wdk, degree_spectrum):
    """
    build the combined dot features of a promoter (linear counterpart of
    the promoter kernel)

    @param data: sequences to split with split_data_promoter
    @param center_offset: offset argument forwarded to split_data_promoter
    @param center_pos: position argument forwarded to split_data_promoter
    @param degree_wdk: degree for the hashed WDK features of the center part
    @param degree_spectrum: degree for the spectrum features of both borders
    @return: CombinedDotFeatures holding center, left and right features,
             appended in that order
    """
    print("creating __hashed__ promoter features (for linear SVM)")

    center_part, left_part, right_part = split_data_promoter(data, center_offset, center_pos)

    # one dot-feature object per region, in center/left/right order
    region_feats = [
        create_hashed_features_wdk(center_part, degree_wdk),
        create_hashed_features_spectrum(left_part, degree_spectrum),
        create_hashed_features_spectrum(right_part, degree_spectrum),
    ]

    # stack the regions into a single combined feature object
    combined = CombinedDotFeatures()
    for region in region_feats:
        combined.append_feature_obj(region)

    return combined
def create_hashed_features_wdk(data, degree, hash_bits=4, start_degree=0):
    """
    creates hashed dot features for the wdk

    @param data: sequences handed to StringCharFeatures with the DNA alphabet
    @param degree: passed twice to HashedWDFeaturesTransposed (third and
                   fourth constructor argument)
    @param hash_bits: hash_bits argument of HashedWDFeaturesTransposed
                      (defaults to 4, the value that used to be hard-coded)
    @param start_degree: start degree argument of HashedWDFeaturesTransposed
                         (defaults to 0, the value that used to be hard-coded)
    @return: HashedWDFeaturesTransposed built on byte features of the data
    """
    # fixed parameters of the char -> byte conversion
    order = 1
    gap = 0
    reverse = True

    # create raw features
    feats_char = StringCharFeatures(data, DNA)
    feats_raw = StringByteFeatures(DNA)
    feats_raw.obtain_from_char(feats_char, order - 1, order, gap, reverse)

    # finish up
    feats = HashedWDFeaturesTransposed(feats_raw, start_degree, degree, degree, hash_bits)

    return feats
def create_hashed_features_spectrum(data, degree):
    """
    creates hashed dot features for the spectrum kernel

    @param data: sequences the word features are built from
    @param degree: word order of the spectrum features
    @return: ImplicitWeightedSpecFeatures built on the sorted word features,
             constructed with normalize=True
    """
    # fixed parameters
    normalize = True

    # the sorted word features are exactly what get_spectrum_features builds,
    # so reuse it instead of repeating the construction here
    feats_word = get_spectrum_features(data, order=degree, gap=0, reverse=True)

    # finish
    feats = ImplicitWeightedSpecFeatures(feats_word, normalize)

    return feats
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment