shaka-packager/tools/site_compare/scrapers/chrome/chromebase.py

200 lines
5.2 KiB
Python
Executable File

#!/usr/bin/env python
# Copyright (c) 2011 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Does scraping for all currently-known versions of Chrome"""
import pywintypes
import types
from drivers import keyboard
from drivers import mouse
from drivers import windowing
# TODO: this has moved, use some logic to find it. For now,
# expects a subst k:.
DEFAULT_PATH = r"k:\chrome.exe"
def InvokeBrowser(path):
"""Invoke the Chrome browser.
Args:
path: full path to browser
Returns:
A tuple of (main window, process handle, address bar, render pane)
"""
# Reuse an existing instance of the browser if we can find one. This
# may not work correctly, especially if the window is behind other windows.
# TODO(jhaas): make this work with Vista
wnds = windowing.FindChildWindows(0, "Chrome_XPFrame")
if len(wnds):
wnd = wnds[0]
proc = None
else:
# Invoke Chrome
(proc, wnd) = windowing.InvokeAndWait(path)
# Get windows we'll need
address_bar = windowing.FindChildWindow(wnd, "Chrome_AutocompleteEdit")
render_pane = GetChromeRenderPane(wnd)
return (wnd, proc, address_bar, render_pane)
def Scrape(urls, outdir, size, pos, timeout, kwargs):
"""Invoke a browser, send it to a series of URLs, and save its output.
Args:
urls: list of URLs to scrape
outdir: directory to place output
size: size of browser window to use
pos: position of browser window
timeout: amount of time to wait for page to load
kwargs: miscellaneous keyword args
Returns:
None if success, else an error string
"""
if "path" in kwargs and kwargs["path"]: path = kwargs["path"]
else: path = DEFAULT_PATH
(wnd, proc, address_bar, render_pane) = InvokeBrowser(path)
# Resize and reposition the frame
windowing.MoveAndSizeWindow(wnd, pos, size, render_pane)
# Visit each URL we're given
if type(urls) in types.StringTypes: urls = [urls]
timedout = False
for url in urls:
# Double-click in the address bar, type the name, and press Enter
mouse.ClickInWindow(address_bar)
keyboard.TypeString(url, 0.1)
keyboard.TypeString("\n")
# Wait for the page to finish loading
load_time = windowing.WaitForThrobber(wnd, (20, 16, 36, 32), timeout)
timedout = load_time < 0
if timedout:
break
# Scrape the page
image = windowing.ScrapeWindow(render_pane)
# Save to disk
if "filename" in kwargs:
if callable(kwargs["filename"]):
filename = kwargs["filename"](url)
else:
filename = kwargs["filename"]
else:
filename = windowing.URLtoFilename(url, outdir, ".bmp")
image.save(filename)
if proc:
windowing.SetForegroundWindow(wnd)
# Send Alt-F4, then wait for process to end
keyboard.TypeString(r"{\4}", use_modifiers=True)
if not windowing.WaitForProcessExit(proc, timeout):
windowing.EndProcess(proc)
return "crashed"
if timedout:
return "timeout"
return None
def Time(urls, size, timeout, kwargs):
"""Measure how long it takes to load each of a series of URLs
Args:
urls: list of URLs to time
size: size of browser window to use
timeout: amount of time to wait for page to load
kwargs: miscellaneous keyword args
Returns:
A list of tuples (url, time). "time" can be "crashed" or "timeout"
"""
if "path" in kwargs and kwargs["path"]: path = kwargs["path"]
else: path = DEFAULT_PATH
proc = None
# Visit each URL we're given
if type(urls) in types.StringTypes: urls = [urls]
ret = []
for url in urls:
try:
# Invoke the browser if necessary
if not proc:
(wnd, proc, address_bar, render_pane) = InvokeBrowser(path)
# Resize and reposition the frame
windowing.MoveAndSizeWindow(wnd, (0,0), size, render_pane)
# Double-click in the address bar, type the name, and press Enter
mouse.ClickInWindow(address_bar)
keyboard.TypeString(url, 0.1)
keyboard.TypeString("\n")
# Wait for the page to finish loading
load_time = windowing.WaitForThrobber(wnd, (20, 16, 36, 32), timeout)
timedout = load_time < 0
if timedout:
load_time = "timeout"
# Send an alt-F4 to make the browser close; if this times out,
# we've probably got a crash
windowing.SetForegroundWindow(wnd)
keyboard.TypeString(r"{\4}", use_modifiers=True)
if not windowing.WaitForProcessExit(proc, timeout):
windowing.EndProcess(proc)
load_time = "crashed"
proc = None
except pywintypes.error:
proc = None
load_time = "crashed"
ret.append( (url, load_time) )
if proc:
windowing.SetForegroundWindow(wnd)
keyboard.TypeString(r"{\4}", use_modifiers=True)
if not windowing.WaitForProcessExit(proc, timeout):
windowing.EndProcess(proc)
return ret
def main():
# We're being invoked rather than imported, so run some tests
path = r"c:\sitecompare\scrapes\chrome\0.1.97.0"
windowing.PreparePath(path)
# Scrape three sites and save the results
Scrape([
"http://www.microsoft.com",
"http://www.google.com",
"http://www.sun.com"],
path, (1024, 768), (0, 0))
return 0
if __name__ == "__main__":
sys.exit(main())