Retrieve NCBI data for the accession numbers listed in files.
package main
import (
// chunkSize is the maximum number of accession numbers looked up per request.
const chunkSize = 10

// numCoroutines is the number of requests allowed to run in parallel.
const numCoroutines = 4

// Package-level state shared between main, init and getNCBIData.
var (
	// re extracts the WebEnv token from a response body; it is compiled in main.
	re *regexp.Regexp
	// apiKey holds the value of the NCBI_API_KEY environment variable (set in init).
	apiKey string
)
// getAccFromFile reads accession numbers from the text file at fpath, one per
// line, and returns them in file order.
//
// Surrounding whitespace is trimmed and blank lines are skipped, so empty
// identifiers never end up in the comma-joined ID list built from the result.
// A progress line is printed to stdout before the file is opened.
//
// Errors do not abort the program: if the file cannot be opened an empty
// (non-nil) slice is returned, and if reading fails part-way the accessions
// read so far are returned. In both cases the error is reported on stderr
// instead of being silently dropped.
func getAccFromFile(fpath string) []string {
	accn := []string{}
	fmt.Printf("+ working on: %s\n", fpath)

	f, err := os.Open(fpath)
	if err != nil {
		fmt.Fprintf(os.Stderr, "EE: cannot open %s: %v\n", fpath, err)
		return accn
	}
	defer f.Close()

	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())
		if line == "" {
			continue
		}
		accn = append(accn, line)
	}
	// Scan returns false on both EOF and failure; only Err tells them apart.
	if err := scanner.Err(); err != nil {
		fmt.Fprintf(os.Stderr, "EE: reading %s: %v\n", fpath, err)
	}
	return accn
}
// splitSlice partitions s into consecutive chunks of at most size elements,
// preserving order; only the final chunk may be shorter. The result is always
// non-nil, and is empty when s is empty.
//
// A non-positive size cannot describe a chunk length (0 would never advance
// through s and a negative value would produce invalid slice bounds), so it is
// treated as "do not split": all of s is returned as a single chunk.
//
// The chunks are views into s, not copies. Each one has its capacity clipped
// to its length so that appending to a chunk allocates instead of overwriting
// the elements of the following chunk.
func splitSlice(s []string, size int) [][]string {
	if size <= 0 {
		size = len(s)
	}
	if len(s) == 0 {
		return [][]string{}
	}

	out := make([][]string, 0, len(s)/size+1)
	for i := 0; i < len(s); {
		// Compare against the remaining length rather than computing i+size
		// first, so a very large size cannot overflow.
		j := len(s)
		if size < j-i {
			j = i + size
		}
		out = append(out, s[i:j:j])
		i = j
	}
	return out
}
// getNCBIData looks up the given accession numbers in the "nuccore" database
// using two HTTP form POSTs and returns response data as raw bytes.
//
// The first POST submits the comma-joined accession list and the reply is
// searched for a WebEnv token with the package-level regexp re (compiled in
// main, so main must have run first). The second POST sends that token
// together with rettype "gb", query_key "1" and retmax "500".
//
// An empty slice is returned when accn is empty or when no WebEnv token was
// obtained.
//
// Reference curl invocations kept from the original source:
//curl -sk -X POST -d "id=${IDLIST}&"${APIKEY}
//curl -s -o $OUTFILE "${FORMAT}&WebEnv=${WE}&query_key=1&retstart=0&retmax=500"
func getNCBIData(accn []string) []byte {
//return []byte(strings.Join(accn, ",") + "\n\n")
// Nothing to look up: skip the network round trips entirely.
if len(accn) == 0 {
return []byte{}
// A new client (and transport) is built on every call, with a 59 second
// overall timeout per request.
// NOTE(review): sharing one client across calls would let connections be
// reused between goroutines - consider hoisting it.
client := &http.Client{
Timeout: 59 * time.Second,
Transport: &http.Transport{
MaxIdleConnsPerHost: numCoroutines,
MaxIdleConns: 100,
// NOTE(review): the format string is empty and contains no verb, so apiKey is
// not substituted into a URL here (fmt reports the unused argument in its
// output instead). Confirm the intended endpoint URL.
endpoint := fmt.Sprintf("", apiKey)
// Form fields for the first request: the ID list, the database and an
// (empty) contact e-mail.
postData := url.Values{
"id": {strings.Join(accn, ",")},
"db": {"nuccore"},
"email": {""},
//fmt.Printf("%s\n", postData.Encode())
// Step 1: attempt the upload while tries is positive, starting from 3.
tries := 3
webEnv := ""
for tries > 0 {
resp, err := client.PostForm(endpoint, postData)
if err != nil {
//fmt.Printf("+ Status: %s\n", resp.Status)
// NOTE(review): this defer sits inside the retry loop; deferred calls run when
// the function returns, so bodies from earlier attempts stay open until then.
defer resp.Body.Close()
// The read error is discarded; a failed read simply yields no match below.
body, _ := io.ReadAll(resp.Body)
//xx := re.FindStringSubmatch(body)
mm := re.FindSubmatch(body)
if len(mm) > 0 {
//fmt.Printf("%d\t%v", len(mm), mm[1])
// mm[1] is the first capture group of re, i.e. the token text.
webEnv = string(mm[1])
// Delay of (3 - tries) seconds.
time.Sleep(time.Duration(3-tries) * time.Second)
//fmt.Printf("WebEnd: %s\n", webEnv)
// Without a token the second request cannot be made.
if webEnv == "" {
return []byte{}
//curl -s -o $OUTFILE "
// NOTE(review): the endpoint for the second request is an empty string -
// confirm the intended URL.
endpoint = ""
// Form fields for the second request.
// NOTE(review): the curl reference above uses "retstart"; the key "restart"
// below looks like a typo - confirm against the E-utilities documentation.
postData = url.Values{
"api_key": {apiKey},
"db": {"nuccore"},
"rettype": {"gb"},
"WebEnv": {webEnv},
"query_key": {"1"},
"restart": {"0"},
"retmax": {"500"},
// Step 2: attempt the download while tries is positive, starting from 2.
body := []byte{}
tries = 2
for tries > 0 {
resp, err := client.PostForm(endpoint, postData)
if err != nil {
//fmt.Printf("+ Status: %s\n", resp.Status)
// NOTE(review): same defer-inside-loop pattern as in step 1.
defer resp.Body.Close()
// The read error is discarded here as well.
body, _ = io.ReadAll(resp.Body)
// Status 200 is the success condition checked for this request.
if resp.StatusCode == 200 {
//fmt.Println("+", string(body))
// Delay of (3 - tries) seconds, after which the body read so far is
// replaced by an empty slice.
time.Sleep(time.Duration(3-tries) * time.Second)
body = []byte{}
return body
func init() {
apiKey = os.Getenv("NCBI_API_KEY")
if "" == apiKey {
fmt.Println("\nMissing NCBI_API_KEY. Press Ctrl-C to cancel...")
time.Sleep(3 * time.Second)
// main expects exactly one command-line argument: a directory. For entries of
// that directory it derives an output path by replacing a trailing ".new"
// with ".gb", reads accession numbers from the input file, fetches the
// corresponding data in chunks of chunkSize with at most numCoroutines
// requests in flight, and writes the collected chunks to the output file in
// chunk order.
func main() {
// Exactly one argument (the directory) is required.
if len(os.Args) != 2 {
log.Fatal("missing directory as argument...")
dir := os.Args[1]
files, err := os.ReadDir(dir)
if err != nil {
// Compile the regexps used later: re is the package-level pattern that
// getNCBIData uses to find the WebEnv token; reExt rewrites the extension.
re = regexp.MustCompile(`<WebEnv>(.+)</WebEnv>`)
reExt := regexp.MustCompile(`\.new$`)
for _, file := range files {
// Entries are told apart by the ".new" suffix (presumably only ".new" files
// are processed - verify).
if !strings.HasSuffix(file.Name(), ".new") {
// NOTE(review): package path joins with forward slashes only; filepath.Join
// is the OS-aware variant for file system paths.
filePath := path.Join(dir, file.Name())
outPath := reExt.ReplaceAllString(filePath, ".gb")
// Check for an existing output file. The condition is also true when Stat
// fails for a reason other than "does not exist".
if _, err := os.Stat(outPath); !os.IsNotExist(err) {
fmt.Printf("EE: output file %s already exists\n", outPath)
accn := getAccFromFile(filePath)
//fmt.Printf("%s\t%v\n", file.Name(), accn)
//defer fmt.Println("by " + file.Name())
// Create (or truncate) the output file.
fo, err := os.Create(outPath)
if err != nil {
// NOTE(review): this defer sits inside the per-file loop, so every output
// file stays open until main returns.
defer fo.Close()
// wg is waited on below, after all jobs for this file have been started.
var wg sync.WaitGroup
// Divide the accession numbers into chunks of at most chunkSize.
workPool := splitSlice(accn, chunkSize)
numJobs := len(workPool)
// running is a buffered channel whose capacity caps the number of jobs that
// may be in flight at once; collectedData has one slot per job.
running := make(chan bool, numCoroutines) // capacity = concurrency limit
collectedData := make([][]byte, numJobs)
for i, ids := range workPool {
running <- true // Take a slot; blocks while numCoroutines jobs are in flight.
fmt.Printf("len(running)=%d\tcap(running)=%d\n", len(running), cap(running))
// Start the job; i and ids are passed as arguments so each goroutine works on
// its own copies.
go func(i int, ids []string) {
defer func() {
<-running // Release the slot so another job can start.
wg.Done() // Signal that this job is done.
data := getNCBIData(ids)
// Each goroutine writes only to its own index, which keeps the results in
// chunk order.
if len(data) != 0 {
collectedData[i] = data
}(i, ids)
wg.Wait() // Wait until all jobs are done.
// Write the collected chunks through a buffered writer, in chunk order.
w := bufio.NewWriter(fo)
for _, data := range collectedData {
if _, err := w.Write(data); err != nil {
fmt.Println("can't write data to file: ", err)
// Flush pushes any buffered bytes out to the file.
if err = w.Flush(); err != nil {
