GitHub querying
xosski, Dec 13th, 2024
// This program reads GitHub search terms from an input file (one per line),
// queries the GitHub repository search page for each term with a bounded
// number of concurrent requests, extracts "owner/repo" links from the HTML,
// filters out non-repository links, and writes the results to an output file.
package main

import (
	"bufio"
	"fmt"
	"io"
	"log"
	"net/http"
	"net/url"
	"os"
	"regexp"
	"strings"
	"sync"
	"time"
)

func main() {
	inputFile := "/home/kali/Desktop/filename.repo.txt"
	outputFile := "/home/kali/Desktop/repo.txt"

	// Open the input file
	file, err := os.Open(inputFile)
	if err != nil {
		log.Fatalf("Error opening input file: %v", err)
	}
	defer file.Close()

	// Create or truncate the output file
	out, err := os.Create(outputFile)
	if err != nil {
		log.Fatalf("Error creating output file: %v", err)
	}
	defer out.Close()

	// Prepare a buffered writer for the output file
	writer := bufio.NewWriter(out)

	// Wait group to track the goroutines, plus a mutex because
	// bufio.Writer is not safe for concurrent use
	var wg sync.WaitGroup
	var writeMu sync.Mutex

	// Buffered channel used as a semaphore to limit concurrent requests
	concurrencyLimit := 10
	semaphore := make(chan struct{}, concurrencyLimit)

	// Read each query from the input file, one per line
	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		query := strings.TrimSpace(scanner.Text())
		if query == "" {
			continue
		}

		// Increment the wait group for each query
		wg.Add(1)

		// Acquire a slot in the semaphore before sending the request
		semaphore <- struct{}{}

		// Goroutine to handle each query concurrently
		go func(query string) {
			defer wg.Done()                // Decrement the wait group when done
			defer func() { <-semaphore }() // Release the semaphore slot on any exit path

			// Construct the search URL with the query properly escaped
			searchURL := fmt.Sprintf("https://github.com/search?q=%s&type=repositories", url.QueryEscape(query))

			// Perform the HTTP GET request
			resp, err := http.Get(searchURL)
			if err != nil {
				log.Printf("Error querying GitHub for %s: %v", query, err)
				return
			}
			defer resp.Body.Close()

			// Handle rate limiting: back off while still holding the
			// semaphore slot so the other workers slow down as well
			if resp.StatusCode == http.StatusTooManyRequests {
				log.Printf("Rate limit exceeded for %s. Sleeping for 30 seconds...", query)
				time.Sleep(30 * time.Second)
				return
			}

			// Handle any other non-OK status code
			if resp.StatusCode != http.StatusOK {
				log.Printf("Non-OK HTTP status for %s: %d", query, resp.StatusCode)
				return
			}

			// Parse and write results; serialize access to the shared writer
			writeMu.Lock()
			parseAndWriteResults(resp.Body, writer)
			writeMu.Unlock()

			// Short delay between requests to avoid hitting rate limits
			time.Sleep(1 * time.Second)
		}(query)
	}

	// Wait for all goroutines to finish
	wg.Wait()

	if err := scanner.Err(); err != nil {
		log.Printf("Error reading input file: %v", err)
	}

	// Flush the writer buffer to disk
	if err := writer.Flush(); err != nil {
		log.Printf("Error flushing output file: %v", err)
	}
}

func parseAndWriteResults(body io.Reader, writer *bufio.Writer) {
	// Read the whole response body into a string
	buf := new(strings.Builder)
	if _, err := io.Copy(buf, body); err != nil {
		log.Printf("Error reading response body: %v", err)
		return
	}
	content := buf.String()

	// Regex pattern that extracts "owner/repo" style links
	pattern := `href="/([^/]+/[^/]+)"`
	re := regexp.MustCompile(pattern)

	// Find all matches in the page
	matches := re.FindAllStringSubmatch(content, -1)

	// Substrings that identify non-repository links to filter out
	unwanted := []string{
		"opensearch.xml", "manifest.json", "login?return_to=", "resources/",
		"sponsors", "stargazers", "enterprise", "features/copilot#enterprise",
		"premium-support", "signup",
	}

	// Write the filtered matches to the output file
	for _, match := range matches {
		if len(match) > 1 {
			fullURL := "https://github.com/" + match[1]
			if !isUnwanted(fullURL, unwanted) {
				if _, err := writer.WriteString(fullURL + "\n"); err != nil {
					log.Printf("Error writing to output file: %v", err)
					return
				}
			}
		}
	}
}

// isUnwanted reports whether a URL contains any of the unwanted substrings.
func isUnwanted(link string, unwanted []string) bool {
	for _, u := range unwanted {
		if strings.Contains(link, u) {
			return true
		}
	}
	return false
}