The site I am scraping has 6000+ URLs, which translates to 6000+ music genres. Each URL leads to the description of that music genre, which the program then saves to a CSV file. When I scrape more than about 1000 descriptions, the program stops executing.
package main

import (
	"bufio"
	"fmt"
	"os"

	"github.com/gocolly/colly"
)

func main() {
	file, err := os.Create("genre_descriptions.csv")
	if err != nil {
		fmt.Println("Error creating file:", err)
		return
	}
	defer file.Close()

	// Create a buffered writer
	writer := bufio.NewWriter(file)
	defer writer.Flush()

	// Write the CSV header
	if _, err := writer.WriteString("genre,link,description\n"); err != nil {
		fmt.Println("Error writing to file:", err)
	}

	// Instantiate default collector
	c := colly.NewCollector(
		// colly.Async(true),
	)
	// c.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: 10})

	// Find and visit all genre links
	c.OnHTML("li[class='capital-letter genre-term'] > a", func(e *colly.HTMLElement) {
		link := e.Attr("href")

		// Nested collector for the genre's description page
		d := colly.NewCollector()
		d.OnHTML("p[class='genre-desc']", func(h *colly.HTMLElement) {
			description := h.Text
			if _, err := writer.WriteString(fmt.Sprintf("%v,%s,\"%s\"\n", e.Text, h.Request.URL, description)); err != nil {
				fmt.Println("Error writing to file:", err)
			}
		})
		d.Visit(link)
		d.Wait()

		// Flush the writer to ensure all data is written to file
		if err := writer.Flush(); err != nil {
			fmt.Println("Error flushing writer:", err)
		}
	})

	c.Visit("https://www.chosic.com/list-of-music-genres/")
	c.Wait()
}
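A side question about the CSV writing itself: since a description can contain commas, quotes, or newlines, I am assuming encoding/csv would be safer than building each line with Sprintf. A minimal sketch of what I mean (the example row values are made up):

package main

import (
	"encoding/csv"
	"fmt"
	"os"
)

func main() {
	file, err := os.Create("genre_descriptions.csv")
	if err != nil {
		fmt.Println("Error creating file:", err)
		return
	}
	defer file.Close()

	w := csv.NewWriter(file)
	defer w.Flush()

	// Header row
	if err := w.Write([]string{"genre", "link", "description"}); err != nil {
		fmt.Println("Error writing header:", err)
	}

	// Example row; in the scraper this would be e.Text, h.Request.URL.String(), h.Text
	row := []string{"Rock", "https://example.com/rock", "A genre with \"loud\" guitars, drums, etc."}
	if err := w.Write(row); err != nil {
		fmt.Println("Error writing row:", err)
	}
}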
I tried using gocolly and goquery at the same time, but I instantly get a "too many requests" error because I am scraping with goroutines. I also tried Geziyor, which has the same problem.
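For context, this is a minimal sketch of the async + LimitRule approach I am referring to; the Parallelism and Delay values are just placeholders, and the callback only revisits each genre link through the same collector so the limiter applies:

package main

import (
	"fmt"
	"time"

	"github.com/gocolly/colly"
)

func main() {
	// Async collector: Visit calls are queued and run concurrently
	c := colly.NewCollector(
		colly.Async(true),
	)

	// Throttle requests to the domain; the right numbers depend on the site
	err := c.Limit(&colly.LimitRule{
		DomainGlob:  "*chosic.com*",
		Parallelism: 2,
		Delay:       1 * time.Second,
		RandomDelay: 500 * time.Millisecond,
	})
	if err != nil {
		fmt.Println("Error setting limit rule:", err)
	}

	c.OnHTML("li[class='capital-letter genre-term'] > a", func(e *colly.HTMLElement) {
		// With async enabled, these visits go through the same limiter
		// instead of firing one unthrottled goroutine per link
		e.Request.Visit(e.Attr("href"))
	})

	c.Visit("https://www.chosic.com/list-of-music-genres/")
	c.Wait() // blocks until all queued async requests finish
}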