Skip to content

Instantly share code, notes, and snippets.

Last active December 15, 2015 21:29
Show Gist options
  • Save cdfox/5326259 to your computer and use it in GitHub Desktop.
Save cdfox/5326259 to your computer and use it in GitHub Desktop.
Preprocess a set of documents, one on each line of the input file, for LDA inference.
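// For each line of the input file, replace every non-word character (anything
// other than an ASCII letter, digit, or underscore) with a space, lowercase
// all letters, remove stopwords, and write the remaining tokens to the output
// file as one comma-separated line. The file reading/writing code is modeled
// on an external answer whose link is missing from this copy.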
package main
import (
	"bufio"
	"fmt"
	"io"
	"os"
	"regexp"
	"strings"
)
// nonWord matches every character outside [0-9A-Za-z_]. It is compiled once
// at package scope rather than once per input line.
var nonWord = regexp.MustCompile(`\W`)

// main reads the command line (IN_FILE STOPWORD_FILE OUT_FILE), runs the
// preprocessing pipeline, and exits with status 1 if any step fails.
func main() {
	if len(os.Args) < 4 {
		fmt.Println("Too few arguments. Usage: preprocess IN_FILE STOPWORD_FILE OUT_FILE")
		return
	}
	if err := run(os.Args[1], os.Args[2], os.Args[3]); err != nil {
		fmt.Fprintln(os.Stderr, "preprocess:", err)
		// All files are already closed by run, so exiting here skips no
		// pending deferred cleanup.
		os.Exit(1)
	}
}

// run opens the input file, loads the stopword set, creates the output file,
// and streams the filtered documents from input to output. The output file is
// created last so that a missing input or stopword file does not truncate an
// existing output file.
func run(inPath, stopPath, outPath string) error {
	in, err := os.Open(inPath)
	if err != nil {
		return err
	}
	// The file is only read, so an error from Close carries no information
	// about lost data and is deliberately ignored.
	defer in.Close()

	stopwords, err := loadStopwords(stopPath)
	if err != nil {
		return err
	}

	out, err := os.Create(outPath)
	if err != nil {
		return err
	}
	if err := filterLines(in, out, stopwords); err != nil {
		// The processing error is the one worth reporting; a secondary
		// Close failure would only obscure it.
		out.Close()
		return fmt.Errorf("processing %q: %w", inPath, err)
	}
	// A failed Close on a written file can mean lost data, so report it.
	return out.Close()
}

// loadStopwords reads one stopword per line from the file at path and returns
// them as a set. Surrounding whitespace is trimmed and blank lines are
// skipped. Entries are matched against lowercased tokens, so they should be
// lowercase themselves.
func loadStopwords(path string) (map[string]bool, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer f.Close() // read-only; see the note in run

	stopwords := make(map[string]bool)
	r := bufio.NewReader(f)
	for {
		// ReadString returns any trailing data together with io.EOF, so a
		// final line without a newline is still picked up.
		line, err := r.ReadString('\n')
		if err != nil && err != io.EOF {
			return nil, fmt.Errorf("reading stopwords %q: %w", path, err)
		}
		if word := strings.TrimSpace(line); word != "" {
			stopwords[word] = true
		}
		if err == io.EOF {
			return stopwords, nil
		}
	}
}

// filterLines reads newline-separated documents from in. For each one it
// replaces non-word characters with spaces, lowercases the text, splits it on
// whitespace, and drops every token present in stopwords. The surviving tokens
// are written to out as a single comma-separated line; documents with no
// surviving tokens produce no output line.
func filterLines(in io.Reader, out io.Writer, stopwords map[string]bool) error {
	// ReadString is used instead of bufio.Scanner because documents may be
	// longer than Scanner's default token limit.
	r := bufio.NewReader(in)
	w := bufio.NewWriter(out)
	for {
		line, rerr := r.ReadString('\n')
		if rerr != nil && rerr != io.EOF {
			return rerr
		}

		tokens := strings.Fields(strings.ToLower(nonWord.ReplaceAllString(line, " ")))
		// Filter in place: kept never gets ahead of the read position.
		kept := tokens[:0]
		for _, tok := range tokens {
			if !stopwords[tok] {
				kept = append(kept, tok)
			}
		}
		if len(kept) > 0 {
			if _, err := w.WriteString(strings.Join(kept, ",") + "\n"); err != nil {
				return err
			}
		}

		if rerr == io.EOF {
			return w.Flush()
		}
	}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment