Playwright
- 2025.10.27
- Playwright
pip install playwright
Playwright 基本框架 async/await
from playwright.async_api import async_playwright
async with async_playwright() as p:
browser = await p.chromium.launch()
page = await browser.new_page()
await page.goto("...")
title = await page.title()
...
await browser.close()
page / context
- page: 相當於一個瀏覽器 (browser) 中的分頁 (tab)
- context: 一組 session (cookie, localStorage, …)
可由 page.context 取得
Jupyter Notebook (.ipynb) on Windows
在 Windows 的某些執行環境 (如:Jupyter Notebook) 中
預設使用的 asyncio event loop policy (SelectorEventLoopPolicy) 不支援 pipes/subprocess transport
因此 Playwright 在嘗試建立 browser subprocess 或 pipe 通訊時會拋出 NotImplementedError
import asyncio
import platform

# Only Windows needs the override; other platforms keep their default policy.
if platform.system() == "Windows":
    asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
🙅♂️ sync
import threading
import traceback

from playwright.sync_api import sync_playwright


def run_playwright():
    """Open a Chromium page with the sync API and block until the page closes.

    Any exception is printed together with its traceback instead of being
    propagated out of the worker thread.
    """
    try:
        with sync_playwright() as p:
            browser = p.chromium.launch()
            page = browser.new_page()
            page.goto('...')
            ...
            # timeout=0 disables the timeout: wait until the page is closed.
            page.wait_for_event("close", timeout=0)
    except Exception as e:
        print(f"An error occurred: {repr(e)}")
        traceback.print_exc()


# Run Playwright in its own thread so it is isolated from the event loop
# already held by the main (IPython kernel) thread.
thread = threading.Thread(target=run_playwright)
thread.start()
thread.join()  # wait for the thread to finish before continuing
在 Win + VS Code 的 .ipynb 環境中
主執行緒由 IPython kernel 持有 asyncio event loop
透過另開 thread 將其隔離至不受既有 asyncio event loop 影響的獨立執行環境
使 Playwright Sync/Async API 得以運行
async
import asyncio
import threading
import traceback

from playwright.async_api import async_playwright


async def run_playwright():
    """Open a Chromium page with the async API and wait until the page closes.

    Any exception is printed together with its traceback instead of being
    propagated out of the coroutine.
    """
    try:
        async with async_playwright() as p:
            browser = await p.chromium.launch()
            page = await browser.new_page()
            await page.goto('...')
            ...
            # timeout=0 disables the timeout: wait until the page is closed.
            await page.wait_for_event("close", timeout=0)
    except Exception as e:
        print(f"An error occurred: {repr(e)}")
        traceback.print_exc()


def thread_entry():
    """Thread target: run the coroutine on a fresh event loop owned by this thread."""
    asyncio.run(run_playwright())


thread = threading.Thread(target=thread_entry)
thread.start()
thread.join()  # wait for the thread to finish before continuing
用 Playwright 開啟瀏覽器供人為操作
- 下載並安裝 Playwright 可用的瀏覽器 (Chromium/Firefox/WebKit) 及其對應的 driver
python -m playwright install
- 將 launch() 的引數 headless 設為 False
browser = p.chromium.launch(headless=False)
透過 Chrome DevTools Protocol 連接瀏覽器
Python 開啟 Chrome
import subprocess, requests, time
import platform, os
project_name = ""
match platform.system():
case 'Windows':
chrome_path = r"C:\Program Files\Google\Chrome\Application\chrome.exe"
user_data_dir = os.path.join(
os.environ["LOCALAPPDATA"],
"Chrome-CDP" + (project_name if project_name == "" else f"_{project_name}")
)
case 'Linux':
chrome_path = 'google-chrome'
os.path.join(os.path.expanduser("~/.config/google-chrome/"), project_name)
cdp_port = 9453
cdp_url = f"http://127.0.0.1:{cdp_port}/json/version"
subprocess.Popen([
chrome_path,
f"--remote-debugging-port={cdp_port}",
f"--user-data-dir={user_data_dir}",
# "--no-first-run", "--no-default-browser-check", "--disable-sync",
# "--headless=new",
])
# 等待 CDP ready
for _ in range(s := 10): # 最多等 s 秒
try:
r = requests.get(cdp_url, timeout=0.2)
if r.status_code == 200:
break
except requests.exceptions.RequestException:
time.sleep(1)
else:
raise RuntimeError("Chrome CDP did not start")
- Chromium Command Line Switches [1][2]
  - [1] The Chromium Projects: Run Chromium with command-line switches
  - [2] List of Chromium Command Line Switches
- "--no-first-run":跳過首次使用之設定流程
- "--no-default-browser-check":不再跳出「是否設為預設瀏覽器」
- "--disable-sync":停用 Google 帳號同步及登入相關 UI
Playwright 連接至 啟用 remote debugging 的 Chrome
# Sync form shown; for the async API use `async with async_playwright() as p:`
# and `await` the connect call.
# NOTE(review): cdp_url ends with /json/version; connect_over_cdp is documented
# with a base endpoint such as http://localhost:9222. Confirm the longer URL
# is accepted by the installed Playwright version.
with sync_playwright() as p:
    browser = p.chromium.connect_over_cdp(cdp_url)
    # Reuse the most recent context and page of the already-running Chrome.
    context = browser.contexts[-1]
    page = context.pages[-1]
    ...
context
context.cookies()
context.cookies(): list[dict]
傳遞 storage_state
匯出
state = await context.storage_state()
- 匯出為檔案:
await context.storage_state(path="... .json")
匯入
context = await browser.new_context(storage_state="... .json") page = await context.new_page() ...
沿用既存的 user_data_dir
async with async_playwright() as p: context = await p.chromium.launch_persistent_context(user_data_dir=user_data_dir) browser = context.browser page = await context.new_page() ...
page
wait
page.wait_for_url()
page.wait_for_url("**/xxx/**")
- 等待 URL 變成指定路徑
**:= 萬用字元/任意字串(含/)
當 URL 有變動
old_url = page.url
page.wait_for_url(lambda url: url != old_url)
page.expect_navigation()
with page.expect_navigation(): ...
- 等待「瀏覽器層級的導航事件」,包含:
- URL 改變
- document reload
- window.location 改變
- form submit 導致離開頁面
- timeout 預設為 30000 (ms)
- 不設 timeout:with page.expect_navigation(timeout=0):
page.wait_for_load_state()
page.wait_for_load_state('domcontentloaded')
- ✅ HTML 已下載並 parse 完成
- ✅ DOM tree 已建立
- 💁♂️ JS、圖片、CSS 可能還在載
page.wait_for_load_state()
≡ page.wait_for_load_state('load')
- ✅ DOM ready
- ✅ 同步 JS 完成
- 🙆‍♂️ 圖片 / CSS 大多已載入
page.wait_for_load_state('networkidle')
- ⏱️ 連續 500ms 沒有任何 network request
- ⚠️ 不只是主頁,包含 fetch / xhr
page.wait_for_selector()
page.screenshot()
await page.screenshot(path="page.png"[, full_page=True])
- 引數 full_page 預設為 False:只擷取可視範圍
- full_page=True:自動捲動頁面並拼貼為整頁內容
Jupyter Notebook
from IPython.display import Image, display img_bytes = await page.screenshot() display(Image(data=img_bytes))
locator
locator()
nodes = page.locator('<css_selector (default)>')  # [3] CSS Selector
nodes = page.locator('xpath=<xpath>')  # [4] XPath
count(), nth(), first, last, all()
if await nodes.count() > 0: ...
nodes.nth(i) := nodes 中第 i + 1 個結果,i = 0, 1, 2, …, (await nodes.count()) - 1
nodes.first ≡ nodes.nth(0)
nodes.last ≡ nodes.nth(await nodes.count() - 1)
- ❌ nodes.first()
- ❌ nodes.last()
for i in range(await nodes.count()): ...nodes.nth(i)...
≡ for node in await nodes.all(): ...
subnodes = nodes.locator('<css_selector>')
for subnode in await nodes.locator('<css_selector>').all(): ...
get_attribute(), inner_text()
attribute = await node.get_attribute('<attribute>')
text = await node.inner_text()
模擬瀏覽器操作
開啟連結
urljoin()
from urllib.parse import urljoin
# Resolve the link's href (possibly relative) against the current page URL.
# NOTE(review): `a` is presumably a locator for an <a> element; confirm against the caller.
full_url = urljoin(page.url, await a.get_attribute('href'))
# Navigate the current page to the resolved absolute URL.
await page.goto(full_url)
click()
locator([selector]).click()
上一頁
await page.go_back()
Last Updated on 2026/01/26 by A1go