RiansyahTohamba · January 15, 2021 01:44 · RiansyahTohamba · Jan 15, 2021
diff --git a/simple-processing-text b/simple-processing-text
 import nltk
 # Convert a text file into an nltk.Text object
 def convertToText(filename):
    """Read a text file and wrap its tokens in an ``nltk.Text``.

    Pipeline: raw string -> list of tokens -> ``nltk.Text``.

    Args:
        filename: Path of the text file to read.

    Returns:
        The ``nltk.Text`` built from the word-tokenized file contents.
    """
    # The context manager closes the handle even when reading fails;
    # a bare open(...).read() leaves it open until garbage collection.
    with open(filename) as source:
        raw = source.read()
    # Tokens are not only words: punctuation marks (?, ., ",", ...) are
    # tokens as well.
    tokens = nltk.word_tokenize(raw)
    return nltk.Text(tokens)

 def get_context(keyword, filename):
    """Run a concordance lookup for ``keyword`` over the text in ``filename``.

    Args:
        keyword: Word to look up.
        filename: Path of the text file, loaded through ``convertToText``.

    Returns:
        Whatever ``concordance`` returns for the keyword.
        NOTE(review): nltk's ``Text.concordance`` seems to print its matches
        and return ``None`` -- confirm whether callers expect a value here.
    """
    return convertToText(filename).concordance(keyword)

 def convert_to_wordpos(rawstr):
    """Split ``rawstr`` into sentences and POS-tag the tokens of each one.

    Args:
        rawstr: Raw text as a single string.

    Returns:
        A list with one entry per sentence; each entry is the result of
        ``nltk.pos_tag`` applied to that sentence's word tokens.
    """
    return [
        nltk.pos_tag(nltk.word_tokenize(sentence))
        for sentence in nltk.sent_tokenize(rawstr)
    ]
    
 # Convert a file into words paired with their POS tags
 def text_preprocess(filename):
    """Read ``filename`` and return its sentences as POS-tagged tokens.

    Args:
        filename: Path of the text file to read.

    Returns:
        The result of ``convert_to_wordpos`` on the file's contents.
    """
    # The context manager closes the handle even when reading fails;
    # a bare open(...).read() leaves it open until garbage collection.
    with open(filename) as source:
        rawstr = source.read()
    return convert_to_wordpos(rawstr)

 # chunk by np-chunk
 def get_chunk_np(sentence, grammar="NP: {<DT>?<JJ>*<NN>}"):
    """Chunk a sentence with a regular-expression chunk grammar.

    Args:
        sentence: The sentence to parse. Presumably a list of (word, tag)
            pairs such as ``convert_to_wordpos`` produces -- confirm
            against callers.
        grammar: Chunk grammar handed to ``nltk.RegexpParser``. The default
            is the noun-phrase rule this function has always used, so
            one-argument calls behave exactly as before.

    Returns:
        The result of ``RegexpParser.parse``; in it, the spans matched by
        the grammar are tagged with the rule name (``NP`` by default).
    """
    chunker = nltk.RegexpParser(grammar)
    return chunker.parse(sentence)

 # chunk by tag-patterns
 def get_chunk_tagpattern(sentence):
    """Placeholder for chunking by tag patterns; not implemented yet.

    Args:
        sentence: Currently ignored.

    Returns:
        None.
    """
    return None

 # chunk by regex
 def get_chunk_regex(sentence):
    """Placeholder for regex-based chunking: does nothing, returns None."""

 def printAllNPChunk(sentences):
    """Print the ``get_chunk_np`` result for every sentence, in order.

    Args:
        sentences: Iterable of sentences, each passed to ``get_chunk_np``.
    """
    # TODO: once the chunks are printed, the next step is relation
    # extraction -- how to do that is still an open question.
    for chunked in map(get_chunk_np, sentences):
        print(chunked)
        
 def find_by_postag(sentences, tagkeyword):
    """Print every word whose POS tag equals ``tagkeyword``.

    Args:
        sentences: Iterable of sentences; each sentence is an iterable of
            items where index 0 is the word and index 1 is its tag.
        tagkeyword: Tag to match with ``==``.

    Returns:
        None. Matching words are printed one per line, in input order.
    """
    for tagged_sentence in sentences:
        for token in tagged_sentence:
            if token[1] != tagkeyword:
                continue
            print(token[0])

 # find NP in sentences
 def findNP(senteChunk):
    """Print every child of ``senteChunk`` that is labelled ``NP``.

    Args:
        senteChunk: Iterable of chunk-parse children. Tuple items (plain
            tokens) are skipped; every other item must provide ``label()``.

    Returns:
        None. Each child whose ``label()`` equals ``'NP'`` is printed.
    """
    for child in senteChunk:
        # isinstance also recognises tuple subclasses (e.g. namedtuple
        # tokens); an exact type() comparison would send those on to
        # .label() and fail with AttributeError.
        if isinstance(child, tuple):
            continue
        if child.label() == 'NP':
            print(child)
	import nltk
	# Convert a text file into an nltk.Text object
	def convertToText(filename):
	    """Read a text file and wrap its tokens in an ``nltk.Text``.

	    Pipeline: raw string -> list of tokens -> ``nltk.Text``.

	    Args:
	        filename: Path of the text file to read.

	    Returns:
	        The ``nltk.Text`` built from the word-tokenized file contents.
	    """
	    # The context manager closes the handle even when reading fails;
	    # a bare open(...).read() leaves it open until garbage collection.
	    with open(filename) as source:
	        raw = source.read()
	    # Tokens are not only words: punctuation marks (?, ., ",", ...) are
	    # tokens as well.
	    tokens = nltk.word_tokenize(raw)
	    return nltk.Text(tokens)

	def get_context(keyword, filename):
	    """Run a concordance lookup for ``keyword`` over the text in ``filename``.

	    Args:
	        keyword: Word to look up.
	        filename: Path of the text file, loaded through ``convertToText``.

	    Returns:
	        Whatever ``concordance`` returns for the keyword.
	        NOTE(review): nltk's ``Text.concordance`` seems to print its
	        matches and return ``None`` -- confirm whether callers expect a
	        value here.
	    """
	    nltktxt = convertToText(filename)
	    return nltktxt.concordance(keyword)

	def convert_to_wordpos(rawstr):
	    """Split ``rawstr`` into sentences and POS-tag the tokens of each one.

	    Args:
	        rawstr: Raw text as a single string.

	    Returns:
	        A list with one entry per sentence; each entry is the result of
	        ``nltk.pos_tag`` applied to that sentence's word tokens.
	    """
	    return [
	        nltk.pos_tag(nltk.word_tokenize(sentence))
	        for sentence in nltk.sent_tokenize(rawstr)
	    ]

	# Convert a file into words paired with their POS tags
	def text_preprocess(filename):
	    """Read ``filename`` and return its sentences as POS-tagged tokens.

	    Args:
	        filename: Path of the text file to read.

	    Returns:
	        The result of ``convert_to_wordpos`` on the file's contents.
	    """
	    # The context manager closes the handle even when reading fails;
	    # a bare open(...).read() leaves it open until garbage collection.
	    with open(filename) as source:
	        rawstr = source.read()
	    return convert_to_wordpos(rawstr)

	# chunk by np-chunk
	def get_chunk_np(sentence, grammar="NP: {<DT>?<JJ>*<NN>}"):
	    """Chunk a sentence with a regular-expression chunk grammar.

	    Args:
	        sentence: The sentence to parse. Presumably a list of (word, tag)
	            pairs such as ``convert_to_wordpos`` produces -- confirm
	            against callers.
	        grammar: Chunk grammar handed to ``nltk.RegexpParser``. The
	            default is the noun-phrase rule this function has always
	            used, so one-argument calls behave exactly as before.

	    Returns:
	        The result of ``RegexpParser.parse``; in it, the spans matched by
	        the grammar are tagged with the rule name (``NP`` by default).
	    """
	    chunker = nltk.RegexpParser(grammar)
	    return chunker.parse(sentence)

	# chunk by tag-patterns
	def get_chunk_tagpattern(sentence):
	    """Placeholder for chunking by tag patterns; not implemented yet.

	    Args:
	        sentence: Currently ignored.

	    Returns:
	        None.
	    """
	    return None

	# chunk by regex
	def get_chunk_regex(sentence):
	    """Placeholder for regex-based chunking: does nothing, returns None."""

	def printAllNPChunk(sentences):
	    """Print the ``get_chunk_np`` result for every sentence, in order.

	    Args:
	        sentences: Iterable of sentences, each passed to ``get_chunk_np``.
	    """
	    # TODO: once the chunks are printed, the next step is relation
	    # extraction -- how to do that is still an open question.
	    for sentence in sentences:
	        print(get_chunk_np(sentence))

	def find_by_postag(sentences, tagkeyword):
	    """Print every word whose POS tag equals ``tagkeyword``.

	    Args:
	        sentences: Iterable of sentences; each sentence is an iterable of
	            items where index 0 is the word and index 1 is its tag.
	        tagkeyword: Tag to match with ``==``.

	    Returns:
	        None. Matching words are printed one per line, in input order.
	    """
	    for tagged_sentence in sentences:
	        for token in tagged_sentence:
	            word = token[0]
	            tag = token[1]
	            if tag == tagkeyword:
	                print(word)

	# find NP in sentences
	def findNP(senteChunk):
	    """Print every child of ``senteChunk`` that is labelled ``NP``.

	    Args:
	        senteChunk: Iterable of chunk-parse children. Tuple items (plain
	            tokens) are skipped; every other item must provide ``label()``.

	    Returns:
	        None. Each child whose ``label()`` equals ``'NP'`` is printed.
	    """
	    for child in senteChunk:
	        # isinstance also recognises tuple subclasses (e.g. namedtuple
	        # tokens); an exact type() comparison would send those on to
	        # .label() and fail with AttributeError.
	        if isinstance(child, tuple):
	            continue
	        if child.label() == 'NP':
	            print(child)