273 lines
10 KiB
Python
273 lines
10 KiB
Python
# Copyright (c) 2011 The Chromium Authors. All rights reserved.
|
|
# Use of this source code is governed by a BSD-style license that can be
|
|
# found in the LICENSE file.
|
|
|
|
"""Component for automatically creating masks of changing areas of a website.
|
|
|
|
Works by repeated invocation of a browser and scraping of the resulting page.
|
|
Areas that differ will be added to the auto-generated mask. The mask generator
|
|
considers the mask complete when further scrapes fail to produce any differences
|
|
in the mask.
|
|
"""
|
|
|
|
import os # Functions for walking the directory tree
|
|
import tempfile # Get a temporary directory to hold intermediates
|
|
import time # Used for sleep() and naming masks by time
|
|
|
|
import command_line
|
|
import drivers
|
|
from PIL import Image
|
|
from PIL import ImageChops
|
|
import scrapers
|
|
|
|
|
|
def CreateCommand(cmdline):
  """Inserts the maskmaker command and its arguments into a command line.

  Registers the "maskmaker" subcommand on the given command line object,
  wiring up ValidateMaskmaker/ExecuteMaskmaker as the validation and
  execution callbacks, then declares all of its arguments and the
  dependency/exclusion constraints between them.

  Args:
    cmdline: the command-line parser object to register the command with.
  """
  cmd = cmdline.AddCommand(
    ["maskmaker"],
    "Automatically generates a mask from a list of URLs",
    ValidateMaskmaker,
    ExecuteMaskmaker)

  # Browser selection: an explicit executable path, or a browser name
  # (+ optional version) for the scraper registry to resolve.
  cmd.AddArgument(
    ["-bp", "--browserpath"], "Full path to browser's executable",
    type="readfile", metaname="PATH")
  cmd.AddArgument(
    ["-b", "--browser"], "Which browser to use", type="string",
    default="chrome")
  cmd.AddArgument(
    ["-bv", "--browserver"], "Version of the browser", metaname="VERSION")
  cmd.AddArgument(
    ["-o", "--outdir"], "Directory to store generated masks", metaname="DIR",
    required=True)

  # URL input: a single --url or a --list file (mutually exclusive,
  # one of them required).
  cmd.AddArgument(
    ["-u", "--url"], "URL to compare")
  cmd.AddArgument(
    ["-l", "--list"], "List of URLs to compare", type="readfile")
  cmd.AddMutualExclusion(["--url", "--list"])

  # Line-range options only make sense together with --list.
  cmd.AddArgument(
    ["-s", "--startline"], "First line of URL list", type="int")
  cmd.AddArgument(
    ["-e", "--endline"], "Last line of URL list (exclusive)", type="int")
  cmd.AddArgument(
    ["-c", "--count"], "Number of lines of URL file to use", type="int")
  cmd.AddDependency("--startline", "--list")
  cmd.AddRequiredGroup(["--url", "--list"])
  cmd.AddDependency("--endline", "--list")
  cmd.AddDependency("--count", "--list")
  cmd.AddMutualExclusion(["--count", "--endline"])
  cmd.AddDependency("--count", "--startline")

  cmd.AddArgument(
    ["-t", "--timeout"], "Amount of time (seconds) to wait for browser to "
    "finish loading",
    type="int", default=60)
  cmd.AddArgument(
    ["-w", "--wait"],
    "Amount of time (in seconds) to wait between successive scrapes",
    type="int", default=60)
  cmd.AddArgument(
    ["-sc", "--scrapes"],
    "Number of successive scrapes which must result in no change to a mask "
    "before mask creation is considered complete", type="int", default=10)
  cmd.AddArgument(
    ["-sz", "--size"], "Browser window size", default=(800, 600), type="coords")
  cmd.AddArgument(["-sd", "--scrapedir"], "Directory to store scrapes")
  cmd.AddArgument(
    ["-gu", "--giveup"],
    "Number of times to scrape before giving up", type="int", default=50)
  # BUG FIX: the two implicitly-concatenated literals below previously read
  # '...the scrape will be' + 'discarded...', rendering "will bediscarded"
  # in help output; a trailing space restores the intended text.
  # NOTE: "threshhold" is misspelled but kept — ExecuteMaskmaker reads
  # command["--threshhold"], so renaming would break the interface.
  cmd.AddArgument(
    ["-th", "--threshhold"],
    "Percentage of different pixels (0-100) above which the scrape will be "
    "discarded and the mask not updated.", type="int", default=100)
  # NOTE(review): "--er" looks like it was meant to be the short flag "-er";
  # left as-is since changing it would alter the accepted flag spelling.
  cmd.AddArgument(
    ["--er", "--errors"],
    "Number of times a scrape can fail before giving up on the URL.",
    type="int", default=1)
|
|
|
|
|
|
def ValidateMaskmaker(command):
  """Validate the arguments to maskmaker. Raises ParseError if failed."""
  browser_path = command["--browserpath"]
  if not browser_path:
    return  # no explicit browser executable given; nothing to check
  # The path must end in a recognized executable extension (case-insensitive).
  extension = os.path.splitext(browser_path)[1].lower()
  if extension not in (".exe", ".com", ".bat"):
    raise command_line.ParseError("Browser filename must be an executable")
|
|
|
|
|
|
def ExecuteMaskmaker(command):
  """Performs automatic mask generation.

  Repeatedly scrapes each URL and folds the differences between successive
  scrapes into a monochrome mask image (black pixels = areas that changed).
  A URL's mask is considered complete once --scrapes consecutive scrapes
  produce no new differences; a URL is dropped after --errors failed
  scrapes; the whole run stops after --giveup passes.

  Args:
    command: parsed command object; option values are read via dict-style
      access, e.g. command["--outdir"].
  """

  # Get the list of URLs to generate masks for
  class MaskmakerURL(object):
    """Helper class for holding information about a URL passed to maskmaker."""
    # url: the URL to scrape.
    # consecutive_successes: scrapes in a row that changed nothing in the mask.
    # errors: failed scrape attempts so far.
    __slots__ = ['url', 'consecutive_successes', 'errors']
    def __init__(self, url):
      self.url = url
      self.consecutive_successes = 0
      self.errors = 0

  if command["--url"]:
    url_list = [MaskmakerURL(command["--url"])]
  else:
    # Select a line range from the list file. Absent options are None,
    # which a Python slice treats as "from start" / "to end".
    startline = command["--startline"]
    if command["--count"]:
      endline = startline+command["--count"]
    else:
      endline = command["--endline"]
    # NOTE(review): the file handle opened here is never explicitly closed;
    # it is left for the garbage collector to reclaim.
    url_list = [MaskmakerURL(url.strip()) for url in
                open(command["--list"], "r").readlines()[startline:endline]]

  complete_list = []  # URLs whose masks reached --scrapes quiet passes
  error_list = []     # URLs abandoned after --errors failures

  outdir = command["--outdir"]
  scrapes = command["--scrapes"]
  errors = command["--errors"]
  size = command["--size"]
  scrape_pass = 0

  scrapedir = command["--scrapedir"]
  if not scrapedir: scrapedir = tempfile.gettempdir()

  # Get the scraper
  scraper = scrapers.GetScraper((command["--browser"], command["--browserver"]))

  # Repeatedly iterate through the list of URLs until either every URL has
  # a successful mask or too many errors, or we've exceeded the giveup limit
  while url_list and scrape_pass < command["--giveup"]:
    # Scrape each URL
    for url in url_list:
      print "Processing %r..." % url.url
      mask_filename = drivers.windowing.URLtoFilename(url.url, outdir, ".bmp")

      # Load the existing mask. This is in a loop so we can try to recover
      # from error conditions
      while True:
        try:
          mask = Image.open(mask_filename)
          if mask.size != size:
            # A mask of the wrong size is unusable; retry under a name
            # that embeds the requested size.
            print "  %r already exists and is the wrong size! (%r vs %r)" % (
              mask_filename, mask.size, size)
            mask_filename = "%s_%r%s" % (
              mask_filename[:-4], size, mask_filename[-4:])
            print "  Trying again as %r..." % mask_filename
            continue
          break
        except IOError:
          # No mask yet: start from an all-white (all-ones) monochrome
          # image, i.e. "nothing masked", then loop back to re-open it.
          print "  %r does not exist, creating" % mask_filename
          mask = Image.new("1", size, 1)
          mask.save(mask_filename)

      # Find the stored scrape path
      mask_scrape_dir = os.path.join(
        scrapedir, os.path.splitext(os.path.basename(mask_filename))[0])
      drivers.windowing.PreparePath(mask_scrape_dir)

      # Find the baseline image: the lexicographically first stored scrape
      # (filenames are timestamps, so this is the oldest one).
      mask_scrapes = os.listdir(mask_scrape_dir)
      mask_scrapes.sort()

      if not mask_scrapes:
        print "  No baseline image found, mask will not be updated"
        baseline = None
      else:
        baseline = Image.open(os.path.join(mask_scrape_dir, mask_scrapes[0]))

      mask_scrape_filename = os.path.join(mask_scrape_dir,
                                          time.strftime("%y%m%d-%H%M%S.bmp"))

      # Do the scrape
      result = scraper.Scrape(
        [url.url], mask_scrape_dir, size, (0, 0),
        command["--timeout"], path=command["--browserpath"],
        filename=mask_scrape_filename)

      if result:
        # Return value other than None means an error
        print "  Scrape failed with error '%r'" % result
        url.errors += 1
        if url.errors >= errors:
          print "  ** Exceeded maximum error count for this URL, giving up"
        # Skip to the next URL either way; a failed scrape never updates
        # the mask.
        continue

      # Load the new scrape
      scrape = Image.open(mask_scrape_filename)

      # Calculate the difference between the new scrape and the baseline,
      # subject to the current mask (multiplying by the mask zeroes out
      # pixels that are already masked, so they can't count as differences)
      if baseline:
        diff = ImageChops.multiply(ImageChops.difference(scrape, baseline),
                                   mask.convert(scrape.mode))

        # If the difference is none, there's nothing to update
        if max(diff.getextrema()) == (0, 0):
          print "  Scrape identical to baseline, no change in mask"
          url.consecutive_successes += 1
          if url.consecutive_successes >= scrapes:
            print "  ** No change for %r scrapes, done!" % scrapes
        else:
          # convert the difference to black and white, then change all
          # black pixels (where the scrape and the baseline were identical)
          # to white, all others (where the scrape and the baseline differed)
          # to black.
          #
          # Since the below command is a little unclear, here's how it works.
          #    1. convert("L") converts the RGB image to grayscale
          #    2. point() maps grayscale values (or the individual channels)
          #       of an RGB image) to different ones. Because it operates on
          #       individual channels, the grayscale conversion from step 1
          #       is necessary.
          #    3. The "1" second parameter to point() outputs the result as
          #       a monochrome bitmap. If the original RGB image were converted
          #       directly to monochrome, PIL would dither it.
          diff = diff.convert("L").point([255]+[0]*255, "1")

          # count the number of different pixels (getcolors() on a
          # monochrome image returns [(black_count, 0), (white_count, 255)];
          # the first entry is the black/differing pixel count)
          diff_pixels = diff.getcolors()[0][0]

          # is this too much?
          diff_pixel_percent = diff_pixels * 100.0 / (mask.size[0]*mask.size[1])
          if diff_pixel_percent > command["--threshhold"]:
            print ("  Scrape differed from baseline by %.2f percent, ignoring"
                   % diff_pixel_percent)
          else:
            print "  Scrape differed in %d pixels, updating mask" % diff_pixels
            # Multiplying blackens the mask wherever diff is black, i.e.
            # wherever this scrape differed from the baseline.
            mask = ImageChops.multiply(mask, diff)
            mask.save(mask_filename)

            # reset the number of consecutive "good" scrapes
            url.consecutive_successes = 0

    # Remove URLs whose mask is deemed done
    complete_list.extend(
      [url for url in url_list if url.consecutive_successes >= scrapes])
    error_list.extend(
      [url for url in url_list if url.errors >= errors])
    url_list = [
      url for url in url_list if
      url.consecutive_successes < scrapes and
      url.errors < errors]

    scrape_pass += 1
    print "**Done with scrape pass %d\n" % scrape_pass

    if scrape_pass >= command["--giveup"]:
      print "**Exceeded giveup threshhold. Giving up."
    else:
      # NOTE(review): this sleep happens even when url_list just became
      # empty, so the final pass waits once before the loop exits.
      print "Waiting %d seconds..." % command["--wait"]
      time.sleep(command["--wait"])

  print
  print "*** MASKMAKER COMPLETE ***"
  print "Summary report:"
  print "  %d masks successfully generated" % len(complete_list)
  for url in complete_list:
    print "    ", url.url
  print "  %d masks failed with too many errors" % len(error_list)
  for url in error_list:
    print "    ", url.url
  if scrape_pass >= command["--giveup"]:
    print ("  %d masks were not completed before "
           "reaching the giveup threshhold" % len(url_list))
    for url in url_list:
      print "    ", url.url