Skip to content

Commit

Permalink
Complete tests
Browse files Browse the repository at this point in the history
  • Loading branch information
Uğur Özyılmazel committed Sep 7, 2020
1 parent 5e58ed6 commit 346493a
Show file tree
Hide file tree
Showing 3 changed files with 130 additions and 9 deletions.
68 changes: 67 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,70 @@ $ gem install scraper_rb --version "0.0.0" --source "https://rubygems.pkg.github

## Example Usage

@wip
Basic scraper:

```ruby
require "scraper_rb"

s = ScraperRb.new('https://pypi.org/classifiers/') # no params
s.get
s.response
# {
# :headers=>{:"Content-Length"=>...},
# :url=>"https://pypi.org/classifiers/",
# :data=>"<!DOCTYPE html>\n<html> ...",
# }

s.response[:headers] # => return response headers
s.response[:data] # => return scraped html
s.save('/tmp/data.html') # => {:file=>"/tmp/data.html", :size=>321322}

# or

save_result = s.save('/tmp/data.html')
puts save_result[:error] if save_result.key?(:error) # we have a file error
```

You can add url parameters for extra operations. Valid parameters are:

- `auth_password`: for HTTP Realm auth password
- `auth_username`: for HTTP Realm auth username
- `cookie`: URL Encoded cookie header.
- `country`: 2 character country code. If you wish to scrape from an IP address of a specific country.
- `referer`: HTTP referer header
- `selector`: CSS style selector path such as `a.btn div li`. If `selector`
  is given, the returned result will be a collection of data and the saved
  file will be in `.json` format.

Here is an example with using url parameters and `selector`:

```ruby
require "scraper_rb"

params = {country: 'EE', selector: 'ul li button[data-clipboard-text]'}
s = ScraperRb.new('https://pypi.org/classifiers/', params)
s.get
s.response[:headers] # => return response headers
s.response[:data] # => return an array, collection of given selector
s.response[:data].length # => 734
s.save('/tmp/test.json') # => {:file=>"/tmp/test.json", :size=>174449}

# or

save_result = s.save('/tmp/test.json')
puts save_result[:error] if save_result.key?(:error) # we have a file error
```

Default **timeout** value is set to `10` seconds. You can change this while
initializing the instance:

```ruby
s = ScraperRb.new('https://pypi.org/classifiers/', {}, 50)
# => 50 seconds timeout w/o params

s = ScraperRb.new('https://pypi.org/classifiers/', {country: 'EE'}, 50)
# => 50 seconds timeout
```

---

Expand Down Expand Up @@ -56,6 +119,9 @@ rake release[remote] # Create tag v0.0.0 and build and push bin_checker_rb-X.X.
rake test # Run tests
```

- If you have `PROMPTAPI_TOKEN` set, real HTTP-request-based tests become available.
- Set `RUBY_DEVELOPMENT` to `1` for more verbose test results.

---

## License
Expand Down
29 changes: 25 additions & 4 deletions lib/scraper_rb.rb
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,10 @@ def call(env)
end

class << self
def new(url, params={})
def new(url, params={}, timeout=10)
puts "params: #{params}" if ENV['RUBY_DEVELOPMENT']
ScraperRb::Scraper.new(url, params)
puts "timeout: #{timeout}" if ENV['RUBY_DEVELOPMENT']
ScraperRb::Scraper.new(url, params, timeout)
end
end

Expand All @@ -31,13 +32,15 @@ class Scraper

attr_accessor :options, :response

def initialize(url, params=nil, timeout=10)
def initialize(url, params, timeout)
params = {} if params == nil
@options = {
url: ENV['PROMPTAPI_TEST_ENDPOINT'] || 'https://api.promptapi.com/scraper',
params: {url: url},
request: {timeout: timeout},
headers: {'Accept' => 'application/json', 'apikey' => ENV['PROMPTAPI_TOKEN']},
}
puts "-> params: #{params}"
params.each do |key, value|
@options[:params][key] = value if VALID_PARAMS.map(&:to_sym).include?(key)
end
Expand Down Expand Up @@ -79,6 +82,24 @@ def get
end
end

end
# Persists the scraped response to disk.
#
# Writes @response[:data] to +filename+. When the data is an Array
# (selector-based results) it is serialized as JSON and the file
# extension is forced to ".json"; otherwise the raw HTML string is
# written with a ".html" extension. The caller's directory and base
# name are preserved; only the extension is normalized.
#
# filename - String path to save to.
#
# Returns a Hash: {file:, size:} on success, {error:} on failure
# (no data available, or the file could not be written).
def save(filename)
  return {error: 'Data is not available'} unless @response[:data]

  if @response[:data].is_a?(Array)
    save_extension = '.json'
    save_data = JSON.generate(@response[:data])
  else
    save_extension = '.html'
    save_data = @response[:data]
  end

  # Normalize the extension to match the payload type.
  file_dirname = File.dirname(filename)
  file_basename = File.basename(filename, save_extension)
  file_savename = "#{file_dirname}/#{file_basename}#{save_extension}"

  begin
    File.open(file_savename, 'w') { |file| file.write(save_data) }
    {file: file_savename, size: File.size(file_savename)}
  rescue SystemCallError => e
    # Covers Errno::ENOENT (missing directory) as before, plus other
    # filesystem errors such as Errno::EACCES and Errno::EISDIR that
    # previously escaped uncaught.
    {error: "#{e}"}
  end
end

end
end
42 changes: 38 additions & 4 deletions test/scraper_rb_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -17,26 +17,60 @@ def test_promptapi_token
def test_scrape_with_basic_params
skip "PROMPTAPI_TOKEN required, skipping test..." unless ENV['PROMPTAPI_TOKEN']

s = ScraperRb.new('https://vbyazilim.com', {country: 'EE'})
s = ScraperRb.new('https://pypi.org/classifiers/', {country: 'EE'})
s.get

assert s.response
assert s.response.fetch(:headers)
assert s.response.fetch(:data)
assert s.response.fetch(:url)

result = s.save('/tmp/test.html')
assert result.fetch(:file)
assert result.fetch(:size)
assert result.fetch(:size) > 300 * 1024
end

def test_scrape_with_selector_param
skip "PROMPTAPI_TOKEN required, skipping test..." unless ENV['PROMPTAPI_TOKEN']

mega_selector = 'body > section.section.main.has-white-background > div > div > div:nth-child(2) > div > div > div > ul > li'
s = ScraperRb.new('https://vbyazilim.com', {country: 'EE', selector: mega_selector})
s = ScraperRb.new('https://pypi.org/classifiers/', {country: 'EE', selector: 'ul li button[data-clipboard-text]'})
s.get

assert s.response
assert s.response.fetch(:headers)
assert s.response.fetch(:data)
assert s.response.fetch(:url)
assert_equal s.response[:data].class, Array
assert s.response[:data].length > 5
assert s.response[:data].length > 700

result = s.save('/tmp/test.json')
assert result.fetch(:file)
assert result.fetch(:size)
assert result.fetch(:size) > 512

error_result = s.save('/tmp-fake/dir/test.json')
refute_nil error_result.fetch(:error)
assert_equal error_result[:error], 'No such file or directory @ rb_sysopen - /tmp-fake/dir/test.json'
end


# Verifies the optional timeout argument: a generous 50s timeout should
# let the request succeed, while a 1s timeout against the same endpoint
# should surface a timeout error in the response hash.
# Requires PROMPTAPI_TOKEN since it performs real HTTP requests.
def test_scrape_with_timeout
  skip "PROMPTAPI_TOKEN required, skipping test..." unless ENV['PROMPTAPI_TOKEN']

  s = ScraperRb.new('https://pypi.org/classifiers/', {}, 50)
  s.get

  assert s.response
  assert s.response.fetch(:headers)
  assert s.response.fetch(:data)
  assert s.response.fetch(:url)

  s = ScraperRb.new('https://pypi.org/classifiers/', {}, 1)
  s.get

  refute_nil s.response.fetch(:error)
  # assert_equal takes (expected, actual) — expected message first.
  # NOTE(review): message text copied verbatim from the library's error
  # formatting — confirm casing if the underlying HTTP client changes.
  assert_equal "Net::readtimeout with #<tcpsocket:(closed)>", s.response.fetch(:error)
end

end

0 comments on commit 346493a

Please sign in to comment.