Skip to content

Commit

Permalink
Complete tests
Browse files Browse the repository at this point in the history
  • Loading branch information
Uğur Özyılmazel committed Sep 7, 2020
1 parent 5e58ed6 commit 346493a
Show file tree
Hide file tree
Showing 3 changed files with 130 additions and 9 deletions.
68 changes: 67 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,70 @@ $ gem install scraper_rb --version "0.0.0" --source "https://rubygems.pkg.github

## Example Usage

@wip
Basic scraper:

```ruby
require "scraper_rb"

s = ScraperRb.new('https://pypi.org/classifiers/') # no params
s.get
s.response
# {
# :headers=>{:"Content-Length"=>...},
# :url=>"https://pypi.org/classifiers/",
# :data=>"<!DOCTYPE html>\n<html> ...",
# }

s.response[:headers] # => return response headers
s.response[:data] # => return scraped html
s.save('/tmp/data.html') # => {:file=>"/tmp/data.html", :size=>321322}

# or

save_result = s.save('/tmp/data.html')
puts save_result[:error] if save_result.key?(:error) # we have a file error
```

You can add url parameters for extra operations. Valid parameters are:

- `auth_password`: for HTTP Realm auth password
- `auth_username`: for HTTP Realm auth username
- `cookie`: URL Encoded cookie header.
- `country`: 2 character country code. If you wish to scrape from an IP address of a specific country.
- `referer`: HTTP referer header
- `selector`: CSS style selector path such as `a.btn div li`. If `selector`
  is given, the returned result will be a collection of data and the saved
  file will be in `.json` format.

Here is an example with using url parameters and `selector`:

```ruby
require "scraper_rb"

params = {country: 'EE', selector: 'ul li button[data-clipboard-text]'}
s = ScraperRb.new('https://pypi.org/classifiers/', params)
s.get
s.response[:headers] # => return response headers
s.response[:data] # => return an array, collection of given selector
s.response[:data].length # => 734
s.save('/tmp/test.json') # => {:file=>"/tmp/test.json", :size=>174449}

# or

save_result = s.save('/tmp/test.json')
puts save_result[:error] if save_result.key?(:error) # we have a file error
```

Default **timeout** value is set to `10` seconds. You can change this while
initializing the instance:

```ruby
s = ScraperRb.new('https://pypi.org/classifiers/', {}, 50)
# => 50 seconds timeout w/o params

s = ScraperRb.new('https://pypi.org/classifiers/', {country: 'EE'}, 50)
# => 50 seconds timeout
```

---

Expand Down Expand Up @@ -56,6 +119,9 @@ rake release[remote] # Create tag v0.0.0 and build and push bin_checker_rb-X.X.
rake test # Run tests
```

- If you have `PROMPTAPI_TOKEN` set, real HTTP-request-based tests become available.
- Set `RUBY_DEVELOPMENT` to `1` for more verbose test results.

---

## License
Expand Down
29 changes: 25 additions & 4 deletions lib/scraper_rb.rb
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,10 @@ def call(env)
end

class << self
def new(url, params={})
def new(url, params={}, timeout=10)
puts "params: #{params}" if ENV['RUBY_DEVELOPMENT']
ScraperRb::Scraper.new(url, params)
puts "timeout: #{timeout}" if ENV['RUBY_DEVELOPMENT']
ScraperRb::Scraper.new(url, params, timeout)
end
end

Expand All @@ -31,13 +32,15 @@ class Scraper

attr_accessor :options, :response

def initialize(url, params=nil, timeout=10)
def initialize(url, params, timeout)
params = {} if params == nil
@options = {
url: ENV['PROMPTAPI_TEST_ENDPOINT'] || 'https://api.promptapi.com/scraper',
params: {url: url},
request: {timeout: timeout},
headers: {'Accept' => 'application/json', 'apikey' => ENV['PROMPTAPI_TOKEN']},
}
puts "-> params: #{params}"
params.each do |key, value|
@options[:params][key] = value if VALID_PARAMS.map(&:to_sym).include?(key)
end
Expand Down Expand Up @@ -79,6 +82,24 @@ def get
end
end

end
# Persists the scraped response to disk.
#
# Writes @response[:data] to +filename+. When the data is an Array
# (selector-based results) it is serialized as JSON and the file
# extension is forced to ".json"; otherwise the raw HTML string is
# written with a ".html" extension. The caller's directory and base
# name are preserved; only the extension is normalized.
#
# filename - String path to save to.
#
# Returns a Hash: {file:, size:} on success, {error:} on failure
# (no data available, or the file could not be written).
def save(filename)
  return {error: 'Data is not available'} unless @response[:data]

  if @response[:data].is_a?(Array)
    save_extension = '.json'
    save_data = JSON.generate(@response[:data])
  else
    save_extension = '.html'
    save_data = @response[:data]
  end

  # Normalize the extension to match the payload type.
  file_dirname = File.dirname(filename)
  file_basename = File.basename(filename, save_extension)
  file_savename = "#{file_dirname}/#{file_basename}#{save_extension}"

  begin
    File.open(file_savename, 'w') { |file| file.write(save_data) }
    {file: file_savename, size: File.size(file_savename)}
  rescue SystemCallError => e
    # Covers Errno::ENOENT (missing directory) as before, plus other
    # filesystem errors such as Errno::EACCES and Errno::EISDIR that
    # previously escaped uncaught.
    {error: "#{e}"}
  end
end

end
end
42 changes: 38 additions & 4 deletions test/scraper_rb_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -17,26 +17,60 @@ def test_promptapi_token
def test_scrape_with_basic_params
skip "PROMPTAPI_TOKEN required, skipping test..." unless ENV['PROMPTAPI_TOKEN']

s = ScraperRb.new('https://vbyazilim.com', {country: 'EE'})
s = ScraperRb.new('https://pypi.org/classifiers/', {country: 'EE'})
s.get

assert s.response
assert s.response.fetch(:headers)
assert s.response.fetch(:data)
assert s.response.fetch(:url)

result = s.save('/tmp/test.html')
assert result.fetch(:file)
assert result.fetch(:size)
assert result.fetch(:size) > 300 * 1024
end

def test_scrape_with_selector_param
skip "PROMPTAPI_TOKEN required, skipping test..." unless ENV['PROMPTAPI_TOKEN']

mega_selector = 'body > section.section.main.has-white-background > div > div > div:nth-child(2) > div > div > div > ul > li'
s = ScraperRb.new('https://vbyazilim.com', {country: 'EE', selector: mega_selector})
s = ScraperRb.new('https://pypi.org/classifiers/', {country: 'EE', selector: 'ul li button[data-clipboard-text]'})
s.get

assert s.response
assert s.response.fetch(:headers)
assert s.response.fetch(:data)
assert s.response.fetch(:url)
assert_equal s.response[:data].class, Array
assert s.response[:data].length > 5
assert s.response[:data].length > 700

result = s.save('/tmp/test.json')
assert result.fetch(:file)
assert result.fetch(:size)
assert result.fetch(:size) > 512

error_result = s.save('/tmp-fake/dir/test.json')
refute_nil error_result.fetch(:error)
assert_equal error_result[:error], 'No such file or directory @ rb_sysopen - /tmp-fake/dir/test.json'
end


# Verifies the optional timeout argument: a generous 50s timeout should
# let the request succeed, while a 1s timeout against the same endpoint
# should surface a timeout error in the response hash.
# Requires PROMPTAPI_TOKEN since it performs real HTTP requests.
def test_scrape_with_timeout
  skip "PROMPTAPI_TOKEN required, skipping test..." unless ENV['PROMPTAPI_TOKEN']

  s = ScraperRb.new('https://pypi.org/classifiers/', {}, 50)
  s.get

  assert s.response
  assert s.response.fetch(:headers)
  assert s.response.fetch(:data)
  assert s.response.fetch(:url)

  s = ScraperRb.new('https://pypi.org/classifiers/', {}, 1)
  s.get

  refute_nil s.response.fetch(:error)
  # assert_equal takes (expected, actual) — expected message first.
  # NOTE(review): message text copied verbatim from the library's error
  # formatting — confirm casing if the underlying HTTP client changes.
  assert_equal "Net::readtimeout with #<tcpsocket:(closed)>", s.response.fetch(:error)
end

end

0 comments on commit 346493a

Please sign in to comment.