Browse Source

comics: started work on downloading the alt text for each comic

Signed-off-by: gryffyn <me@neveris.one>
main
gryffyn 2 months ago
parent
commit
628baa1d55
Signed by: gryffyn GPG Key ID: 6948DD6514D02BEF
  1. 13
      cmd/cmd.go
  2. 2
      comics/doa.go
  3. 49
      comics/ggar.go
  4. 26
      dlstrip/dlstrip.go
  5. 2
      go.mod

13
cmd/cmd.go

@ -18,6 +18,7 @@ import (
func Run() {
var comic, dir, first, last string
var text bool
app := &cli.App{
Name: "comicscraper",
@ -54,6 +55,12 @@ func Run() {
Usage: "number/date of the last comic",
Destination: &last,
},
&cli.BoolFlag{
Name: "text",
Aliases: []string{"t"},
Usage: "associated text of comic",
Destination: &text,
},
},
Action: func(c *cli.Context) error {
var err error
@ -92,11 +99,17 @@ func Run() {
if last == "" {
bar := progressbar.Default(1)
err = comics.GetGGARStrip(fi, dir, bar)
if text {
err = comics.GetGGARText(fi, dir)
}
} else {
li, _ := strconv.Atoi(last)
max := li - fi + 1
bar := progressbar.Default(int64(max - 1))
err = dlstrip.GetAllInt(comics.GenIntArray(fi, li), dir, bar, comics.GetGGARStrip)
if text {
err = dlstrip.GetAllText(comics.GenIntArray(fi, li), dir, comics.GetGGARText)
}
}
}
fmt.Println("\nFinished downloading.")

2
comics/doa.go

@ -12,7 +12,7 @@ import (
"golang.org/x/net/html"
)
const StartDate_DOA = "2010-09-06"
// StartDate_DOA = "2010-09-06"
func GetDOAStrip(strip time.Time, filepath string, bar *progressbar.ProgressBar) error {
layout := "2006-01-02"

49
comics/ggar.go

@ -10,7 +10,9 @@ import (
"regexp"
"strings"
strip "github.com/grokify/html-strip-tags-go"
"github.com/schollz/progressbar/v3"
"golang.org/x/net/html"
)
//go:embed ggar-titles
@ -46,3 +48,50 @@ func GetGGARStrip(stripIndex int, filepath string, bar *progressbar.ProgressBar)
bar.Add(1)
return err
}
// ggarCommentRe captures the author's commentary that sits between the
// `cc-newsbody">` marker and the fixed "old archive" footer sentence on a
// strip page. It is compiled once rather than on every download.
var ggarCommentRe = regexp.MustCompile(`cc-newsbody"\>(.*)You can find these two strips in the old archive`)

// ggarCommentCleaner normalizes the captured HTML before tag stripping:
// non-breaking spaces (the raw U+00A0 character, written as an escape so it
// is visible, and the &nbsp; entity) become plain spaces and <br> becomes a
// newline.
var ggarCommentCleaner = strings.NewReplacer(
	"\u00a0", " ",
	"&nbsp;", " ",
	"<br>", "\n",
)

// GetGGARText downloads the page of the strip called stripName, extracts the
// commentary text that accompanies the comic and writes it as plain text to
// "<filepath><fileindex, zero-padded to 4 digits>-<stripName>.txt".
//
// filepath is used as a raw prefix, so it has to end with a path separator
// for the file to land inside that directory.
//
// An error is returned when the request fails, the server answers with a
// non-200 status, the body cannot be read, the page contains no recognizable
// commentary, or the output file cannot be written.
func GetGGARText(stripName string, fileindex int, filepath string) error {
	resp, err := http.Get("https://www.gogetaroomie.com/ggar-rerun/" + stripName)
	if err != nil {
		return fmt.Errorf("fetching page for %q: %w", stripName, err)
	}
	// Only defer the close once resp is known to be non-nil.
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("fetching page for %q: unexpected status %s", stripName, resp.Status)
	}

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return fmt.Errorf("reading page for %q: %w", stripName, err)
	}

	// A page without the expected markers must not panic on match[1].
	match := ggarCommentRe.FindSubmatch(body)
	if match == nil {
		return fmt.Errorf("no comment text found on page for %q", stripName)
	}
	comment := strip.StripTags(ggarCommentCleaner.Replace(string(match[1])))

	name := fmt.Sprintf("%s%04d-%s.txt", filepath, fileindex, stripName)
	// os.WriteFile also reports errors from closing the file, which a
	// deferred Close would silently drop.
	return os.WriteFile(name, []byte(comment), 0666)
}
// GetGGARTitles fetches the archive page of the given series and returns the
// first-attribute value of every <option> element found in it, in document
// order, with a leading "ggar-rerun/" removed. Options whose first attribute
// is empty, or that carry no attributes at all, are skipped.
//
// On any request or parse failure it returns a nil slice and the error.
func GetGGARTitles(series string) ([]string, error) {
	req, err := http.NewRequest("GET", "https://www.gogetaroomie.com/"+series+"/archive", nil)
	if err != nil {
		return nil, fmt.Errorf("building archive request for %q: %w", series, err)
	}

	client := &http.Client{}
	resp, err := client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("fetching archive for %q: %w", series, err)
	}
	// Only defer the close once resp is known to be non-nil.
	defer resp.Body.Close()

	doc, err := html.Parse(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("parsing archive for %q: %w", series, err)
	}

	var titles []string
	var walk func(*html.Node)
	walk = func(n *html.Node) {
		// Guard the Attr[0] access: an <option> without attributes would
		// otherwise panic with an index out of range.
		if n.Type == html.ElementNode && n.Data == "option" && len(n.Attr) > 0 {
			if val := n.Attr[0].Val; val != "" {
				titles = append(titles, strings.TrimPrefix(val, "ggar-rerun/"))
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(doc)

	return titles, nil
}

26
dlstrip/dlstrip.go

@ -9,6 +9,7 @@ import (
// dlStripDate is a download callback that takes a time.Time, a string and a
// progress bar; GetAllDate accepts one as its dlstrip argument.
type dlStripDate func(time.Time, string, *progressbar.ProgressBar) error
// dlStripInt is the int-keyed counterpart of dlStripDate: it takes an int, a
// string and a progress bar.
type dlStripInt func(int, string, *progressbar.ProgressBar) error
// dlStripIntNB has the same shape as dlStripInt without the progress bar
// parameter (NB presumably stands for "no bar" — confirm); GetAllText accepts
// one as its dlstrip argument.
type dlStripIntNB func(int, string) error
func GetAllDate(arr []time.Time, filepath string, bar *progressbar.ProgressBar, dlstrip dlStripDate) error {
total := len(arr)
@ -59,3 +60,28 @@ func GetAllInt(arr []int, filepath string, bar *progressbar.ProgressBar, dlstrip
wg.Wait()
return err
}
// GetAllText calls dlstrip(n, filepath) once for every n in arr, spreading
// the work over four goroutines. arr is cut into four contiguous chunks of
// len(arr)/4 elements; the last chunk also takes the remainder, so for fewer
// than four elements a single goroutine handles everything.
//
// A failing call does not stop the remaining ones. After all goroutines have
// finished, the first error that was recorded is returned, or nil when every
// call succeeded.
func GetAllText(arr []int, filepath string, dlstrip dlStripIntNB) error {
	const workers = 4

	total := len(arr)
	size := total / workers
	rmdr := total % workers

	var (
		wg       sync.WaitGroup
		mu       sync.Mutex // guards firstErr
		firstErr error
	)

	for i := 0; i < workers; i++ {
		start := i * size
		end := start + size
		if i == workers-1 {
			end += rmdr
		}

		wg.Add(1)
		go func(chunk []int) {
			defer wg.Done()
			for _, n := range chunk {
				err := dlstrip(n, filepath)
				if err == nil {
					continue
				}
				// Keep the first failure instead of letting a later
				// successful call overwrite it, and do so under a lock so the
				// goroutines do not race on the shared variable.
				mu.Lock()
				if firstErr == nil {
					firstErr = err
				}
				mu.Unlock()
			}
		}(arr[start:end])
	}

	wg.Wait()
	return firstErr
}

2
go.mod

@ -3,7 +3,7 @@ module git.neveris.one/gryffyn/comicscraper
go 1.16
require (
github.com/davecgh/go-spew v1.1.1
github.com/grokify/html-strip-tags-go v0.0.1
github.com/schollz/progressbar/v3 v3.7.5
github.com/urfave/cli/v2 v2.3.0
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3

Loading…
Cancel
Save