check for duplicate urls

Author: Dave Machado
Date: 2017-07-12 09:32:38 -04:00
Parent: 80e8997d86
Commit: 5fbf817c1c

@@ -1,31 +1,55 @@
 #!/usr/bin/env ruby
-require 'faraday'
+require 'httparty'
 require 'uri'
 allowed_codes = [200, 302, 403]
 args = ARGV
 filename = args[0]
 fail_flag = false
 contents = File.open(filename, 'rb') { |f| f.read }
-links = URI.extract(contents, ['http', 'https'])
-dup = links.select{|element| links.count(element) > 1 }
-if dup.uniq.length > 0
-  dup.uniq.each do |link|
-    if link.end_with?(')')
-      puts link[0...-1]
-    end
-  end
-  exit(1)
-end
-links.each do |link|
-  if link.end_with?(')')
-    link = link[0...-1]
-  end
-  res = Faraday.get(link)
-  if !allowed_codes.include?(res.status)
-    puts "(#{res.status}): #{link}"
-    fail_flag = true
-  end
-end
+raw_links = URI.extract(contents, ['http', 'https'])
+# Remove trailing ')' from entry URLs
+links = []
+raw_links.each do |link|
+  if link.end_with?(')')
+    links.push(link[0...-1])
+  else
+    links.push(link)
+  end
+end
+# Fail on any duplicate elements
+dup = links.select{|element| links.count(element) > 1}
+if dup.uniq.length > 0
+  dup.uniq.each do |e|
+    puts "Duplicate link: #{e}"
+  end
+  fail_flag = true
+end
+# Remove any duplicates from array
+links = links.uniq
+count = 0
+total = links.length
+fails = []
+# GET each link and check for valid response code from allowed_codes
+links.each do |link|
+  begin
+    count += 1
+    puts "(#{count}/#{total}) #{link}"
+    res = HTTParty.get(link, timeout: 10)
+    if !allowed_codes.include?(res.code)
+      fails.push("(#{res.code}): #{link}")
+      fail_flag = true
+    else
+      puts "\t(#{res.code})"
+    end
+  rescue
+    puts "FAIL: (#{res.code}) #{link}"
+    fails.push("(#{res.code}): #{link}")
+    fail_flag = true
+  end
+end
+fails.each do |e|
+  puts e
+end
 if fail_flag
   exit(1)
 else
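
The rewrite splits the old single loop into three passes: normalize the extracted URLs, fail on duplicates, then GET each unique link. The trailing-')' trim in the first pass exists because URI.extract scans raw Markdown, and for an entry like [API](https://example.com/docs) the closing parenthesis is a legal URI "mark" character and is captured with the link. A quick illustration (the URL is mine, not from the commit):

    require 'uri'

    # ')' is a mark character in the RFC 2396 grammar URI.extract uses,
    # so the Markdown delimiter that closes the link target is kept.
    puts URI.extract('[API](https://example.com/docs)', ['http', 'https']).inspect
    # => ["https://example.com/docs)"]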
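
The duplicate check itself, links.select{|element| links.count(element) > 1}, calls count once per element and rescans the whole array each time, so it is quadratic in the number of links. Harmless at this scale, but a single counting pass does the same job in linear time; a sketch of that variant (mine, not the commit's code):

    # Sketch, not the commit's code: one-pass duplicate detection with a
    # tally hash instead of links.count inside select.
    links = ['http://a.example', 'http://b.example', 'http://a.example']  # sample input
    counts = Hash.new(0)
    links.each { |link| counts[link] += 1 }
    dups = counts.select { |_link, n| n > 1 }.keys
    dups.each { |link| puts "Duplicate link: #{link}" }
    # the script would set fail_flag = true here unless dups.empty?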
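
One caveat in the new request loop: the rescue branch prints and records res.code, but when HTTParty.get raises (Net::OpenTimeout on the 10-second limit, SocketError on a bad hostname, and so on) res was never assigned in that iteration, so res.code is a call on nil and the resulting NoMethodError aborts the whole run instead of recording one failure. A safer branch reports the exception itself; a sketch under that assumption (check_link is my name, not the commit's):

    require 'httparty'

    # Sketch, not the commit's code: return nil on success or a
    # "(code): link" failure string; on an exception, report the
    # exception class rather than touching res, which never got assigned.
    def check_link(link, allowed_codes)
      res = HTTParty.get(link, timeout: 10)
      allowed_codes.include?(res.code) ? nil : "(#{res.code}): #{link}"
    rescue StandardError => e
      "(#{e.class}): #{link}"
    end

    failure = check_link('https://example.com', [200, 302, 403])
    puts failure if failure

Either way, the script takes the file to scan as ARGV[0] and exits 1 whenever fail_flag is set, so a duplicate URL or a status code outside allowed_codes fails the run.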