From 4b70cef2f97faf1d23b8a3705e0576bb62ec66d1 Mon Sep 17 00:00:00 2001 From: veasion Date: Sun, 15 Dec 2024 22:52:39 +0800 Subject: [PATCH] feat: Computer for general --- src/background/tools/computer.ts | 304 +++++++++++++++++++++++++++ src/background/tools/computer_use.ts | 3 +- src/background/utils.ts | 14 +- src/content/index.ts | 194 ++++++++++++++++- src/script/common.js | 35 +-- 5 files changed, 522 insertions(+), 28 deletions(-) create mode 100644 src/background/tools/computer.ts diff --git a/src/background/tools/computer.ts b/src/background/tools/computer.ts new file mode 100644 index 0000000..13aebeb --- /dev/null +++ b/src/background/tools/computer.ts @@ -0,0 +1,304 @@ +import { Tool, InputSchema } from "../../types/action.types"; +import * as utils from "../utils"; + +/** + * Computer for general + */ +export class Computer implements Tool { + name: string; + description: string; + input_schema: InputSchema; + windowId?: number; + tabId?: number; + + constructor(size: [number, number]) { + this.name = "computer"; + this.description = `Use a mouse and keyboard to interact with a computer, and take screenshots. +* This is a browser GUI interface where you do not have access to the address bar or bookmarks. You must operate the browser using inputs like screenshots, mouse, keyboard, etc. +* Some operations may take time to process, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you clicked submit button, but it didn't work, try taking another screenshot. +* The screen's resolution is ${size[0]}x${size[1]}. +* Whenever you intend to move the cursor to click on an element, you should consult a screenshot to determine the coordinates of the element before moving the cursor. +* If you tried clicking on a button or link but it failed to load, even after waiting, try adjusting your cursor position so that the tip of the cursor visually falls on the element that you want to click. 
+* Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element.`; + this.input_schema = { + type: "object", + properties: { + action: { + type: "string", + description: `The action to perform. The available actions are: +* \`key\`: Press a key or key-combination on the keyboard. +- This supports js KeyboardEvent syntax. +- Multiple keys are combined using the "+" symbol. +- Examples: "a", "Enter", "Ctrl+s", "Meta+Shift+a", "Delete", "0". +* \`type\`: Type a string of text on the keyboard. +* \`cursor_position\`: Get the current (x, y) pixel coordinate of the cursor on the screen. +* \`mouse_move\`: Move the cursor to a specified (x, y) pixel coordinate on the screen. +* \`left_click\`: Click the left mouse button. +* \`left_click_drag\`: Click and drag the cursor to a specified (x, y) pixel coordinate on the screen. +* \`right_click\`: Click the right mouse button. +* \`double_click\`: Double-click the left mouse button. +* \`screenshot\`: Take a screenshot of the screen. +* \`scroll_to\`: Scroll to the specified (x, y) pixel coordinate.`, + enum: [ + "key", + "type", + "mouse_move", + "left_click", + "left_click_drag", + "right_click", + "double_click", + "screenshot", + "cursor_position", + "scroll_to", + ], + }, + coordinate: { + type: "array", + description: + "(x, y): The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to.", + }, + text: { + type: "string", + description: "Required only by `action=type` and `action=key`", + }, + }, + required: ["action"], + }; + } + + /** + * computer + * + * @param {*} params { action: 'mouse_move', coordinate: [100, 200] } + * @returns { success: true, coordinate?: [], image?: { type: 'base64', media_type: 'image/jpeg', data: '/9j...' } } + */ + async execute(params: unknown): Promise { + if ( + typeof params !== "object" || + params === null || + !("action" in params) + ) { + throw new Error( + 'Invalid parameters. 
Expected an object with a "action" property.' + ); + } + let { action, coordinate, text } = params as any; + let tabId = await this.getTabId(); + let result; + switch (action as string) { + case "key": + result = await key(tabId, text, coordinate); + break; + case "type": + result = await type(tabId, text, coordinate); + break; + case "mouse_move": + result = await mouse_move(tabId, coordinate); + break; + case "left_click": + result = await left_click(tabId, coordinate); + break; + case "left_click_drag": + result = await left_click_drag(tabId, coordinate); + break; + case "right_click": + result = await right_click(tabId, coordinate); + break; + case "double_click": + result = await double_click(tabId, coordinate); + break; + case "screenshot": + result = await screenshot(this.windowId); + break; + case "cursor_position": + result = await cursor_position(tabId); + break; + case "scroll_to": + result = await scroll_to(tabId, coordinate); + break; + default: + throw Error( + `Invalid parameters. 
The "${action}" value is not included in the "action" enumeration.` + ); + } + return { success: true, ...result }; + } + + async getTabId(): Promise { + let tabId = this.tabId; + if (!tabId) { + tabId = await utils.getCurrentTabId(); + } + return tabId as number; + } +} + +export async function key( + tabId: number, + key: string, + coordinate?: [number, number] +) { + if (!coordinate) { + coordinate = (await cursor_position(tabId)).coordinate; + } + await mouse_move(tabId, coordinate); + let mapping: { [key: string]: string } = {}; + let keys = key.replace(/\s+/g, " ").split(" "); + for (let i = 0; i < keys.length; i++) { + let _key = keys[i]; + let keyEvents = { + key: "", + ctrlKey: false, + altKey: false, + shiftKey: false, + metaKey: false, + }; + if (_key.indexOf("+") > -1) { + let mapped_keys = _key.split("+").map((k) => mapping[k] || k); + for (let i = 0; i < mapped_keys.length - 1; i++) { + let k = mapped_keys[i].toLowerCase(); + if (k == "ctrl" || k == "control") { + keyEvents.ctrlKey = true; + } else if (k == "alt" || k == "option") { + keyEvents.altKey = true; + } else if (k == "shift") { + keyEvents.shiftKey = true; + } else if (k == "meta" || k == "command") { + keyEvents.metaKey = true; + } else { + console.log("Unknown Key: " + k); + } + } + keyEvents.key = mapped_keys[mapped_keys.length - 1]; + } else { + keyEvents.key = mapping[_key] || _key; + } + if (!keyEvents.key) { + continue; + } + await chrome.tabs.sendMessage(tabId, { + type: "computer:key", + coordinate, + ...keyEvents, + }); + await utils.sleep(100); + } +} + +export async function type( + tabId: number, + text: string, + coordinate?: [number, number] +) { + if (!coordinate) { + coordinate = (await cursor_position(tabId)).coordinate; + } + await mouse_move(tabId, coordinate); + return await chrome.tabs.sendMessage(tabId, { + type: "computer:type", + text, + coordinate, + }); +} + +export async function mouse_move(tabId: number, coordinate: [number, number]) { + return await 
chrome.tabs.sendMessage(tabId, { + type: "computer:mouse_move", + coordinate, + }); +} + +export async function left_click(tabId: number, coordinate?: [number, number]) { + if (!coordinate) { + coordinate = (await cursor_position(tabId)).coordinate; + } + return await chrome.tabs.sendMessage(tabId, { + type: "computer:left_click", + coordinate, + }); +} + +export async function left_click_drag( + tabId: number, + coordinate: [number, number] +) { + let from_coordinate = (await cursor_position(tabId)).coordinate; + return await chrome.tabs.sendMessage(tabId, { + type: "computer:left_click_drag", + from_coordinate, + to_coordinate: coordinate, + }); +} + +export async function right_click( + tabId: number, + coordinate?: [number, number] +) { + if (!coordinate) { + coordinate = (await cursor_position(tabId)).coordinate; + } + return await chrome.tabs.sendMessage(tabId, { + type: "computer:right_click", + coordinate, + }); +} + +export async function double_click( + tabId: number, + coordinate?: [number, number] +) { + if (!coordinate) { + coordinate = (await cursor_position(tabId)).coordinate; + } + return await chrome.tabs.sendMessage(tabId, { + type: "computer:double_click", + coordinate, + }); +} + +export async function screenshot(windowId?: number): Promise<{ + image: { + type: "base64"; + media_type: "image/png" | "image/jpeg"; + data: string; + }; +}> { + if (!windowId) { + const window = await chrome.windows.getCurrent(); + windowId = window.id; + } + let dataUrl = await chrome.tabs.captureVisibleTab(windowId as number, { + format: "jpeg", // jpeg / png + quality: 80, // 0-100 + }); + let data = dataUrl.substring(dataUrl.indexOf("base64,") + 7); // keep only the payload after the "base64," marker + return { + image: { + type: "base64", + media_type: dataUrl.startsWith("data:image/png") /* inspect the MIME prefix only: the base64 payload can contain "png" by chance */ ? 
"image/png" : "image/jpeg", + data: data, + }, + }; +} + +export async function scroll_to(tabId: number, coordinate: [number, number]) { + let from_coordinate = (await cursor_position(tabId)).coordinate; + return await chrome.tabs.sendMessage(tabId, { + type: "computer:scroll_to", + from_coordinate, + to_coordinate: coordinate, + }); +} + +export async function cursor_position(tabId: number): Promise<{ + coordinate: [number, number]; +}> { + let result: any = await chrome.tabs.sendMessage(tabId, { + type: "computer:cursor_position", + }); + return { coordinate: result.coordinate as [number, number] }; +} + +export async function size(tabId?: number): Promise<[number, number]> { + return await utils.getPageSize(tabId); +} diff --git a/src/background/tools/computer_use.ts b/src/background/tools/computer_use.ts index af4e3be..f452397 100644 --- a/src/background/tools/computer_use.ts +++ b/src/background/tools/computer_use.ts @@ -2,7 +2,7 @@ import { Tool, InputSchema } from "../../types/action.types"; import * as utils from "../utils"; /** - * Computer Use + * Computer Use for fellou */ export class ComputerUse implements Tool { name: string; @@ -12,6 +12,7 @@ export class ComputerUse implements Tool { tabId?: number; constructor(size: [number, number]) { + // TODO The screenshot is of the screen, but the plugin returns the relative position of the browser, not the screen, there is a problem! this.name = "computer_use"; this.description = `Use a mouse and keyboard to interact with a computer, and take screenshots. * This is a browser GUI interface where you do not have access to the address bar or bookmarks. You must operate the browser using inputs like screenshots, mouse, keyboard, etc. 
diff --git a/src/background/utils.ts b/src/background/utils.ts index b26e775..e9b6298 100644 --- a/src/background/utils.ts +++ b/src/background/utils.ts @@ -9,8 +9,10 @@ export function getCurrentTabId(): Promise { }); } -export async function getPageSize(): Promise<{ width: number; height: number }> { - let tabId = await getCurrentTabId(); +export async function getPageSize(tabId?: number): Promise<[number, number]> { + if (!tabId) { + tabId = await getCurrentTabId(); + } let injectionResult = await chrome.scripting.executeScript({ target: { tabId: tabId as number }, func: () => [ @@ -22,10 +24,10 @@ export async function getPageSize(): Promise<{ width: number; height: number }> document.body.clientHeight, ], }); - return { - width: injectionResult[0].result[0] as number, - height: injectionResult[0].result[1] as number, - }; + return [ + injectionResult[0].result[0] as number, + injectionResult[0].result[1] as number + ]; } export function sleep(time: number): Promise { diff --git a/src/content/index.ts b/src/content/index.ts index 653f12c..fa5ff34 100644 --- a/src/content/index.ts +++ b/src/content/index.ts @@ -1,5 +1,14 @@ declare const eko: any; +if (!(window as any).eko) { + (window as any).eko = { lastMouseX: 0, lastMouseY: 0 }; +} + +document.addEventListener("mousemove", (event) => { + eko.lastMouseX = event.clientX; + eko.lastMouseY = event.clientY; +}); + chrome.runtime.onMessage.addListener(function (request, sender, sendResponse) { (async () => { try { @@ -14,6 +23,57 @@ chrome.runtime.onMessage.addListener(function (request, sender, sendResponse) { sendResponse(result); break; } + case "computer:key": { + sendResponse(key(request)); + break; + } + case "computer:type": { + sendResponse(type(request)); + break; + } + case "computer:mouse_move": { + sendResponse(mouse_move(request)); + break; + } + case "computer:left_click": { + simulateMouseEvent(request, ["mousedown", "mouseup", "click"], 0); + sendResponse(); + break; + } + case 
"computer:right_click": { + simulateMouseEvent( + request, + ["mousedown", "mouseup", "contextmenu"], + 2 + ); + sendResponse(); + break; + } + case "computer:double_click": { + simulateMouseEvent( + request, + [ + "mousedown", + "mouseup", + "click", + "mousedown", + "mouseup", + "click", + "dblclick", + ], + 0 + ); + sendResponse(); + break; + } + case "computer:left_click_drag": { + sendResponse(left_click_drag(request)); + break; + } + case "computer:scroll_to": { + sendResponse(scroll_to(request)); + break; + } case "computer:cursor_position": { sendResponse({ coordinate: [eko.lastMouseX, eko.lastMouseY] }); break; @@ -26,7 +86,133 @@ chrome.runtime.onMessage.addListener(function (request, sender, sendResponse) { return true; }); -document.addEventListener("mousemove", (event) => { - eko.lastMouseX = event.clientX; - eko.lastMouseY = event.clientY; -}); +function key(request: any) { + const event = new KeyboardEvent(request.keyEventType || "keydown", { + key: request.key, + ctrlKey: request.ctrlKey, + altKey: request.altKey, + shiftKey: request.shiftKey, + metaKey: request.metaKey, + bubbles: true, + cancelable: true, + }); + let coordinate = request.coordinate as [number, number]; + ( + document.activeElement || + document.elementFromPoint(coordinate[0], coordinate[1]) + )?.dispatchEvent(event); +} + +function type(request: any) { + let text = request.text as string; + let coordinate = request.coordinate as [number, number]; + let element = document.elementFromPoint(coordinate[0], coordinate[1]); + if (!element) { + return; + } + let input: any; + if ( + element.tagName == "INPUT" || + element.tagName == "TEXTAREA" || + element.childElementCount == 0 + ) { + input = element; + } else { + input = + element.querySelector("input") || + element.querySelector("textarea") || + element; + } + input.focus && input.focus(); + input.value += text; + input.dispatchEvent(new Event("input", { bubbles: true })); +} + +function mouse_move(request: any) { + let coordinate = 
request.coordinate as [number, number]; + let x = coordinate[0]; + let y = coordinate[1]; + const event = new MouseEvent("mousemove", { + view: window, + bubbles: true, + cancelable: true, + screenX: x, + screenY: y, + clientX: x, + clientY: y, + }); + return document.body.dispatchEvent(event); +} + +function simulateMouseEvent( + request: any, + eventTypes: Array, + button: 0 | 1 | 2 +) { + const coordinate = request.coordinate as [number, number]; + const x = coordinate[0]; + const y = coordinate[1]; + const element = document.elementFromPoint(x, y) || document.body; + for (let i = 0; i < eventTypes.length; i++) { + const event = new MouseEvent(eventTypes[i], { + view: window, + bubbles: true, + cancelable: true, + clientX: x, + clientY: y, + button, // 0 left; 2 right + }); + element.dispatchEvent(event); + } +} + +function scroll_to(request: any) { + // const from_coordinate = request.from_coordinate as [number, number]; + const to_coordinate = request.to_coordinate as [number, number]; + window.scrollTo({ + top: to_coordinate[1], // coordinate is (x, y): y is the vertical offset + left: to_coordinate[0], // x is the horizontal offset + behavior: "smooth", + }); +} + +function left_click_drag(request: any, steps = 10) { + const from_coordinate = request.from_coordinate as [number, number]; + const to_coordinate = request.to_coordinate as [number, number]; + let startX = from_coordinate[0]; + let startY = from_coordinate[1]; + let endX = to_coordinate[0]; + let endY = to_coordinate[1]; + let element = document.elementFromPoint(startX, startY) || document.body; + const mouseDownEvent = new MouseEvent("mousedown", { + bubbles: true, + cancelable: true, + view: window, + clientX: startX, + clientY: startY, + button: 0, + }); + element.dispatchEvent(mouseDownEvent); + for (let i = 1; i <= steps; i++) { + const intermediateX = startX + (endX - startX) * (i / steps); + const intermediateY = startY + (endY - startY) * (i / steps); + const dragEvent = new MouseEvent("mousemove", { + bubbles: true, + cancelable: true, + view: window, + clientX: 
intermediateX, + clientY: intermediateY, + button: 0, + }); + element.dispatchEvent(dragEvent); + } + const mouseUpEvent = new MouseEvent("mouseup", { + bubbles: true, + cancelable: true, + view: window, + clientX: endX, + clientY: endY, + button: 0, + }); + element.dispatchEvent(mouseUpEvent); +} diff --git a/src/script/common.js b/src/script/common.js index d5c504f..4004060 100755 --- a/src/script/common.js +++ b/src/script/common.js @@ -1,12 +1,12 @@ /** - * 通用 JS 函数 + * Common JS function */ if (!window.eko) { window.eko = {} } /** - * 提取网页内容 + * Extract html content */ eko.extractHtmlContent = function (element) { element = element || document.body @@ -37,10 +37,10 @@ eko.extractHtmlContent = function (element) { } /** - * 元素文本(去除连续空白和换行) + * Element text (remove consecutive spaces and line breaks) * - * @param {HTMLElement|string} object 元素/字符串 - * @returns 文本 + * @param {HTMLElement|string} object + * @returns text */ eko.cleanText = function(object) { let str = (typeof object == 'string') ? 
object : object?.innerText @@ -48,16 +48,16 @@ eko.cleanText = function(object) { } /** - * 睡眠 + * sleep * - * @param {number} time 毫秒 + * @param {number} time millisecond */ eko.sleep = function(time) { return new Promise(resolve => setTimeout(() => resolve(), time)) } /** - * 元素是否可见 + * element displayed * * @param {HTMLElement} element */ @@ -66,9 +66,9 @@ eko.isDisplayed = function (element) { } /** - * 点击 + * click * - * @param {HTMLElement} element 元素 + * @param {HTMLElement} element */ eko.click = function(element) { if (element.click) { @@ -83,9 +83,10 @@ eko.click = function(element) { } /** - * 触发模拟输入 + * Trigger simulated input */ eko.sendKeys = function(element, str, clear, keypress) { + element.focus && element.focus() if (clear) { for (let i = 0; i < element.value.length; i++) { element.dispatchEvent(new KeyboardEvent('keydown', { key: 'Backspace' })) @@ -105,7 +106,7 @@ eko.sendKeys = function(element, str, clear, keypress) { } /** - * 等待Dom改变 + * Waiting for Dom to change */ eko.waitForDomChanged = function (targetElement, fun, timeout, config, firstExecute) { targetElement = targetElement || document.body @@ -139,7 +140,7 @@ eko.waitForDomChanged = function (targetElement, fun, timeout, config, firstExec } /** - * 等待页面加载完成 onload 后 + * Wait for the page to finish loading after onload */ eko.waitLoaded = async function() { await eko.waitForDomChanged(document.body, () => document.readyState == 'complete', 5000, {}, true) @@ -147,7 +148,7 @@ eko.waitLoaded = async function() { } /** - * 等待元素出现 + * Wait for the element to present */ eko.waitForElementPresent = function (targetElement, cssSelector, timeout) { targetElement = targetElement || document.body @@ -155,7 +156,7 @@ eko.waitForElementPresent = function (targetElement, cssSelector, timeout) { } /** - * 等待元素可见 + * Wait for the element to displayed */ eko.waitForElementDisplayed = function (targetElement, cssSelector, timeout) { targetElement = targetElement || document.body @@ -175,7 +176,7 @@ 
eko.waitForElementDisplayed = function (targetElement, cssSelector, timeout) { } /** - * 等待元素消失 + * Wait for the element to disappear */ eko.waitForElementNotPresent = function (targetElement, cssSelector, timeout) { targetElement = targetElement || document.body @@ -183,7 +184,7 @@ eko.waitForElementNotPresent = function (targetElement, cssSelector, timeout) { } /** - * 等待元素不可见 + * Wait for the element to become invisible */ eko.waitForElementNotDisplayed = function (targetElement, cssSelector, timeout) { targetElement = targetElement || document.body