Skip to content

Instantly share code, notes, and snippets.

@cghiban
Last active July 26, 2021 20:19
Show Gist options
  • Save cghiban/1389b305007ef730d908a6358d8ce8a1 to your computer and use it in GitHub Desktop.
Save cghiban/1389b305007ef730d908a6358d8ce8a1 to your computer and use it in GitHub Desktop.
Retrieve NCBI data based on the accession numbers listed in files
package main
import (
"bufio"
"fmt"
"io"
"log"
"net/http"
"net/url"
"os"
"path"
"regexp"
"strings"
"sync"
"time"
)
// chunkSize caps how many accession numbers are grouped into one lookup.
const chunkSize = 10

// numCoroutines is the number of requests allowed to run in parallel.
const numCoroutines = 4

var (
	// re is the pattern used to pull the WebEnv token out of a reply;
	// it is compiled in main before any request is made.
	re *regexp.Regexp

	// apiKey holds the value of the NCBI_API_KEY environment variable.
	apiKey string
)
// getAccFromFile reads the file at fpath and returns one accession number per
// non-blank line, with surrounding whitespace (including a trailing "\r" from
// CRLF files) removed.
//
// A progress line is printed to stdout before the file is opened. If the file
// cannot be opened the error is logged and an empty, non-nil slice is
// returned. If reading fails part-way, the error is logged and the accession
// numbers collected so far are returned.
func getAccFromFile(fpath string) []string {
	accn := []string{}
	fmt.Printf("+ working on: %s\n", fpath)

	f, err := os.Open(fpath)
	if err != nil {
		log.Println(err)
		return accn
	}
	defer f.Close()

	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())
		if line == "" {
			// A blank line would otherwise be sent to NCBI as an empty ID.
			continue
		}
		accn = append(accn, line)
	}
	// Scan returns false on both EOF and failure; only Err tells them apart.
	if err := scanner.Err(); err != nil {
		log.Printf("reading %s: %v", fpath, err)
	}
	return accn
}
// splitSlice partitions s into consecutive chunks of at most size elements,
// preserving order; only the last chunk may be shorter.
//
// A size that is zero, negative, or larger than len(s) yields a single chunk
// holding all of s. An empty s yields an empty, non-nil result.
//
// The chunks are views into s (no elements are copied), but each one has its
// capacity clamped to its length, so appending to a chunk cannot overwrite
// the elements of the chunk that follows it.
func splitSlice(s []string, size int) [][]string {
	if len(s) == 0 {
		return [][]string{}
	}
	// A non-positive size would never advance the loop (or would slice
	// backwards); clamping also keeps start+size from overflowing.
	if size <= 0 || size > len(s) {
		size = len(s)
	}

	out := make([][]string, 0, (len(s)+size-1)/size)
	for start := 0; start < len(s); start += size {
		end := start + size
		if end > len(s) {
			end = len(s)
		}
		out = append(out, s[start:end:end])
	}
	return out
}
// ncbiClient is shared by every request so keep-alive connections are reused
// across calls instead of being left idle in a Transport created per call.
var ncbiClient = &http.Client{
	Timeout: 59 * time.Second,
	Transport: &http.Transport{
		MaxIdleConnsPerHost: numCoroutines,
		MaxIdleConns:        100,
	},
}

// postNCBIForm POSTs form to endpoint and returns the full response body
// together with the HTTP status code. The response body is always closed
// before returning, so callers can retry in a loop without piling up open
// bodies. On a transport error the status is 0 and the body is nil.
func postNCBIForm(endpoint string, form url.Values) ([]byte, int, error) {
	resp, err := ncbiClient.PostForm(endpoint, form)
	if err != nil {
		// resp is nil here; it must not be touched.
		return nil, 0, err
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, resp.StatusCode, err
	}
	return body, resp.StatusCode, nil
}

// getNCBIData fetches the GenBank ("gb") records for the given accession
// numbers from the NCBI nuccore database and returns the raw response body.
//
// It works in two steps, mirroring these curl calls:
//
//	curl -sk -X POST -d "id=${IDLIST}&email=dnalcadmin@cshl.edu&db=nuccore" https://eutils.ncbi.nlm.nih.gov/entrez/eutils/epost.fcgi?api_key=${APIKEY}
//	curl -s -o $OUTFILE "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&rettype=${FORMAT}&WebEnv=${WE}&query_key=1&retstart=0&retmax=500"
//
// First the IDs are posted to epost (up to 3 attempts) and the WebEnv token is
// extracted from the reply with the package-level pattern re, which must have
// been compiled before this function is called. Then efetch is called with
// that token (up to 2 attempts) and its body is returned on HTTP 200.
//
// Failures are logged, not returned: an empty, non-nil slice is the result
// when accn is empty, when no WebEnv token could be obtained, or when efetch
// never answered with HTTP 200. The package-level apiKey is sent with both
// requests when it is non-empty.
func getNCBIData(accn []string) []byte {
	if len(accn) == 0 {
		return []byte{}
	}

	// Step 1: epost the IDs and obtain a WebEnv token.
	endpoint := "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/epost.fcgi"
	if apiKey != "" {
		endpoint += "?api_key=" + url.QueryEscape(apiKey)
	}
	form := url.Values{
		"id":    {strings.Join(accn, ",")},
		"db":    {"nuccore"},
		"email": {"dnalcadmin@cshl.edu"},
	}

	webEnv := ""
	for attempt := 0; attempt < 3 && webEnv == ""; attempt++ {
		if attempt > 0 {
			// Back off a little longer before each retry; never after the last try.
			time.Sleep(time.Duration(attempt) * time.Second)
		}
		body, status, err := postNCBIForm(endpoint, form)
		if err != nil {
			log.Println(err)
			continue
		}
		if mm := re.FindSubmatch(body); len(mm) > 1 {
			webEnv = string(mm[1])
		} else {
			log.Printf("epost: no WebEnv in response (HTTP %d)", status)
		}
	}
	if webEnv == "" {
		return []byte{}
	}

	// Pause between the two calls, as the original workflow did.
	time.Sleep(time.Second)

	// Step 2: efetch the records stored under the WebEnv token.
	endpoint = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
	form = url.Values{
		"db":        {"nuccore"},
		"rettype":   {"gb"},
		"WebEnv":    {webEnv},
		"query_key": {"1"},
		"retstart":  {"0"}, // was misspelled "restart", so the parameter was never sent
		"retmax":    {"500"},
	}
	if apiKey != "" {
		form.Set("api_key", apiKey)
	}

	for attempt := 0; attempt < 2; attempt++ {
		if attempt > 0 {
			time.Sleep(time.Duration(attempt) * time.Second)
		}
		body, status, err := postNCBIForm(endpoint, form)
		if err != nil {
			log.Println(err)
			continue
		}
		if status == http.StatusOK {
			return body
		}
		log.Printf("efetch: unexpected HTTP status %d", status)
	}
	return []byte{}
}
// init loads the NCBI API key from the NCBI_API_KEY environment variable into
// apiKey. When the variable is empty or unset it prints a warning and waits
// three seconds, giving the user a chance to abort with Ctrl-C, before the
// program carries on without a key.
func init() {
	if apiKey = os.Getenv("NCBI_API_KEY"); apiKey != "" {
		return
	}

	const gracePeriod = 3 * time.Second
	fmt.Println("\nMissing NCBI_API_KEY. Press Ctrl-C to cancel...")
	time.Sleep(gracePeriod)
}
func main() {
if len(os.Args) != 2 {
log.Fatal("missing directory as argument...")
}
dir := os.Args[1]
files, err := os.ReadDir(dir)
if err != nil {
log.Fatal(err)
os.Exit(1)
}
// init regexps to be used later
re = regexp.MustCompile(`<WebEnv>(.+)</WebEnv>`)
reExt := regexp.MustCompile(`\.new$`)
for _, file := range files {
if !strings.HasSuffix(file.Name(), ".new") {
continue
}
filePath := path.Join(dir, file.Name())
outPath := reExt.ReplaceAllString(filePath, ".gb")
//fmt.Println(outPath)
// do we overwrite the output?
if _, err := os.Stat(outPath); !os.IsNotExist(err) {
fmt.Printf("EE: output file %s already exists\n", outPath)
continue
}
accn := getAccFromFile(filePath)
//fmt.Printf("%s\t%v\n", file.Name(), accn)
//defer fmt.Println("by " + file.Name())
// open output file
fo, err := os.Create(outPath)
if err != nil {
panic(err)
}
// close
defer fo.Close()
// wg is used to wait for the program to finish.
var wg sync.WaitGroup
// devide work into chunks
workPool := splitSlice(accn, chunkSize)
numJobs := len(workPool)
// Launch goroutines to handle the work.
wg.Add(len(workPool))
running := make(chan bool, numCoroutines) // Limit concurrent jobs to ....
collectedData := make([][]byte, numJobs)
for i, ids := range workPool {
running <- true // Fill running; this will block and wait if it's already full.
fmt.Printf("len(running)=%d\tcap(running)=%d\n", len(running), cap(running))
// start the work
go func(i int, ids []string) {
defer func() {
<-running // Drain running so new jobs can be added.
wg.Done() // Signal that this job is done.
}()
data := getNCBIData(ids)
if len(data) != 0 {
collectedData[i] = data
}
//fmt.Println(data)
}(i, ids)
//break
}
wg.Wait() // Wait until all jobs are done.
fmt.Println("done")
// make a write buffer
w := bufio.NewWriter(fo)
for _, data := range collectedData {
if _, err := w.Write(data); err != nil {
fmt.Println("can't write data to file: ", err)
break
}
w.Write([]byte("\n\n"))
}
if err = w.Flush(); err != nil {
fmt.Println(err)
}
break
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment