Skip to content

Commit

Permalink
Merge pull request #4 from stratosphereips/feat_database-support
Browse files Browse the repository at this point in the history
Add DB support to translate messages automatically with LLM model
  • Loading branch information
verovaleros authored Feb 19, 2024
2 parents 0e0e526 + 001f574 commit 1774374
Show file tree
Hide file tree
Showing 8 changed files with 858 additions and 19 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
Expand Down Expand Up @@ -131,3 +130,4 @@ dmypy.json


*.swp
config_finetuned.yml
Binary file added assets/sample.sqlite
Binary file not shown.
22 changes: 22 additions & 0 deletions assets/schema.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
-- Parameter sets under which translations are produced.
-- The UNIQUE constraint below keeps one row per distinct combination of
-- tool name, tool commit, model, config hash and config content.
-- NOTE(review): SQLite treats NULL values as distinct inside UNIQUE
-- constraints, so rows with a NULL in any of these columns are never
-- considered duplicates of each other - confirm callers never pass NULL.
CREATE TABLE IF NOT EXISTS translation_parameters (
-- Integer key; referenced by message_translation.translation_parameters_id
translation_parameters_id INTEGER PRIMARY KEY,
translation_tool_name TEXT,
translation_tool_commit TEXT,
translation_model TEXT,
-- Hash of the configuration, stored next to the full configuration text
translation_config_sha256 TEXT,
translation_config TEXT,
UNIQUE(translation_tool_name, translation_tool_commit, translation_model, translation_config_sha256, translation_config)
);



-- Translated text for a message under a given parameter set.
-- UNIQUE(message_id, translation_parameters_id) allows at most one
-- translation per message for each row of translation_parameters.
-- The messages table is not created by this schema; it is expected to
-- already exist in the database this schema is applied to.
-- NOTE(review): SQLite only enforces FOREIGN KEY clauses when
-- "PRAGMA foreign_keys = ON" is set on the connection - confirm the
-- code opening the database does so if enforcement is wanted.
CREATE TABLE IF NOT EXISTS message_translation (
translation_id INTEGER PRIMARY KEY,
translation_parameters_id INTEGER,
message_id INTEGER,
translation_text TEXT,
-- NOTE(review): TIMESTAMPTZ(0) is not a native SQLite storage class; the
-- value is stored according to SQLite type-affinity rules - confirm the
-- timestamp format written by the application.
translation_timestamp TIMESTAMPTZ(0),
UNIQUE(message_id, translation_parameters_id),
FOREIGN KEY (translation_parameters_id) REFERENCES translation_parameters(translation_parameters_id),
FOREIGN KEY (message_id) REFERENCES messages(message_id)
);
216 changes: 198 additions & 18 deletions hermeneisGPT.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
HermeneisGPT is a tool and framework to translate messages and/or
hermeneisGPT is a tool and framework to translate messages and/or
text from hacktivist channels or forums from Russian to English
using Large Language Models.
"""
Expand All @@ -8,9 +8,20 @@

import argparse
import logging
import os
import yaml
from dotenv import dotenv_values
from openai import OpenAI
from lib.utils import get_current_commit
from lib.utils import get_file_sha256
from lib.utils import get_file_content
from lib.db_utils import get_db_connection
from lib.db_utils import create_tables_from_schema
from lib.db_utils import has_channel_messages
from lib.db_utils import insert_translation_parameters
from lib.db_utils import get_channel_messages
from lib.db_utils import exists_translation_for_message
from lib.db_utils import upsert_message_translation


# Set up logging
Expand All @@ -23,7 +34,7 @@

# Create console handler for logging to the console
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO) # Log INFO and above to the console
console_handler.setLevel(logging.ERROR) # Log ERROR and above to the console

# Create formatter and add it to the handlers
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
Expand Down Expand Up @@ -65,6 +76,121 @@ def load_and_parse_config(yaml_config_path):

return config


def translate_mode_automatic(client, config, args):
    """
    Run the LLM translation in automatic mode using a
    SQLite database. Translations will be written on
    the same DB.

    Arguments:
        client: client object handed to translate() for each message.
        config: parsed YAML configuration; config['model'] is stored
            together with the translation parameters.
        args: parsed command line arguments. Uses max_limit, yaml_config,
            sqlite_db, sqlite_schema and channel_name.

    At most int(args.max_limit) translation requests are made per run.
    Messages that already have a translation for the current translation
    parameters are skipped and do not count towards that limit.

    Stored work is committed on normal completion and when the run is
    interrupted with Ctrl-C. The connection is always closed. Any other
    exception propagates to the caller without a commit.
    """
    limit = int(args.max_limit)
    attempts = 0
    stored = 0
    translation_tool_name = os.path.basename(__file__)
    translation_tool_commit = get_current_commit()
    translation_model = config['model']
    translation_config_sha256 = get_file_sha256(args.yaml_config)
    translation_config = get_file_content(args.yaml_config)

    # Bound before the try block so the handlers below can tell whether a
    # connection was ever opened (Ctrl-C may arrive before it exists).
    connection = None
    try:
        logger.debug("Starting automatic translation")

        logger.debug("Connecting to DB: %s", args.sqlite_db)
        connection, cursor = get_db_connection(args.sqlite_db)

        logger.debug("Creating tables needed for translation using schema: %s", args.sqlite_schema)
        create_tables_from_schema(connection, cursor, args.sqlite_schema)

        has_messages = has_channel_messages(cursor, args.channel_name)
        logger.debug("Checking if there are messages for channel %s: %s", args.channel_name, has_messages)

        logger.debug("Retrieving translation parameters based on user input")
        logger.debug("Retrieving the tool name: %s", translation_tool_name)
        logger.debug("Retrieving the tool current commit: %s", translation_tool_commit)
        logger.debug("Retrieving the LLM model: %s", translation_model)
        logger.debug("Retrieving the YAML config file SHA256: %s", translation_config_sha256)
        logger.debug("Retrieving the YAML config file: %s bytes", len(translation_config))

        translation_parameters_id = insert_translation_parameters(cursor,
                                                                  translation_tool_name,
                                                                  translation_tool_commit,
                                                                  translation_model,
                                                                  translation_config_sha256,
                                                                  translation_config)

        logger.debug("Storing translation parameters to DB and retrieving ID: %s", translation_parameters_id)

        logger.debug("Retrieving messages for channel: %s", args.channel_name)
        channel_messages = get_channel_messages(cursor, args.channel_name)

        logger.info("Processing '%s' messages for channel '%s'", len(channel_messages), args.channel_name)
        for message_id, message_text in channel_messages:
            if attempts >= limit:
                # Translation quota reached
                break

            if message_text is None:
                # A NULL text cannot be measured or sent to the model
                logger.debug("Skipping channel %s message %s: no text to translate", args.channel_name, message_id)
                continue

            logger.debug("Processing channel %s message %s (%s bytes)", args.channel_name, message_id, len(message_text))
            if exists_translation_for_message(cursor, message_id, translation_parameters_id):
                # There is a translation for this message
                logger.debug("Found translation for message %s with translation parameters ID %s", message_id, translation_parameters_id)
                continue

            # There is no translation for this message. Every request counts
            # towards the quota, even a failed one, so API usage stays bounded.
            attempts += 1
            logger.debug("Translating message %s with translation parameters ID %s", message_id, translation_parameters_id)

            # Translate it with OpenAI model
            message_translated = translate(client, config, message_text)
            if message_translated is None:
                # translate() returns None on failure; do not persist an empty
                # translation, so the message can be retried on a later run.
                logger.warning("Translation failed for message %s, nothing stored", message_id)
                continue

            # Update the translation for that row
            msg_translation_id = upsert_message_translation(cursor, message_id, translation_parameters_id, message_translated)
            stored += 1
            logger.debug("Message %s translated with translation ID %s", message_id, msg_translation_id)

        logger.info("Finished translating %s messages for %s channel", stored, args.channel_name)
        connection.commit()
    except KeyboardInterrupt:
        # Keep whatever was translated before the user interrupted the run
        if connection is not None:
            connection.commit()
    finally:
        if connection is not None:
            connection.close()


def translate(client, config, message):
    """
    Run the LLM translation

    Sends a chat completion request made of the configured system prompt
    and the configured user prompt followed by the message.

    Arguments:
        client: object exposing chat.completions.create(), e.g. the
            OpenAI client.
        config: mapping with the keys 'system', 'user', 'model',
            'max_tokens' and 'temperature'.
        message: text to translate; appended to config['user'].

    Returns the content of the first choice of the response, or None when
    anything fails. Failures are logged and never raised to the caller.
    """
    try:
        translate_messages = [
            {"role": "system", "content": config['system']},
            {"role": "user", "content": config['user'] + message},
        ]

        # Request the completion from the OpenAI LLM (Large Language Model)
        llm_response = client.chat.completions.create(
            model=config['model'],
            messages=translate_messages,
            max_tokens=config['max_tokens'],
            temperature=config['temperature'],
        )

        return llm_response.choices[0].message.content

    except Exception as err:
        # Report at ERROR level: a failed request would otherwise be
        # invisible unless debug output is enabled.
        logger.error("Exception in translate(): %s", err)
        return None


def translate_mode_manual(client, config):
    """
    Run the LLM translation in manual interactive mode

    Repeatedly prompts on stdout, reads one line from stdin, translates it
    with translate() and prints the result. Blank lines are ignored.

    Arguments:
        client: client object handed to translate().
        config: parsed YAML configuration handed to translate().

    Returns None when the user presses Ctrl-C or when stdin reaches end of
    file (Ctrl-D, or piped input that has been fully consumed).
    """
    try:
        logger.debug("Starting manual translation")
        while True:
            print("Input your message to translate:")
            input_lang_ru = input().strip()

            # Nothing to translate: prompt again instead of calling the API
            if not input_lang_ru:
                continue

            message_translated = translate(client, config, input_lang_ru)

            # translate() returns None on failure; avoid printing "None"
            if message_translated is None:
                logger.error("Translation failed for the given message")
                continue

            print(message_translated)
    except (KeyboardInterrupt, EOFError):
        return

def main():
"""
Take a message input and use the data from the yaml file to translate
Expand All @@ -79,37 +205,91 @@ def main():
parser = argparse.ArgumentParser(
description='HermeneisGPT: Translate hacking messages from '
'Russian to English using LLMs.')
parser.add_argument('-v',
'--verbose',
action='store_true',
help='run hermeneisGPT in verbose mode')
parser.add_argument('-d',
'--debug',
action='store_true',
help='run hermeneisGPT in debug mode')
parser.add_argument('-c',
'--yaml_config',
default='config_EXAMPLE.yml',
help='Path to the YAML file with challenge data')
help='path to the YAML file with challenge data (default=config_EXAMPLE.yml)')
parser.add_argument('-e',
'--env',
default='.env',
help='Path to environment file (.env)')
help='path to environment file (.env)')
parser.add_argument('-m',
'--mode',
choices=['manual', 'auto-sqlite'],
default='manual',
help='select the mode (manual or auto-sqlite)')

parser.add_argument('--channel_name',
help='name of the hacktivist telegram channel to translate')
parser.add_argument('--max_limit',
default=10,
help='maximum number of messages to translate automatically (default=10)')

parser.add_argument('--sqlite_db',
help='path to SQLite database with messages to translate')
parser.add_argument('--sqlite_schema',
default='assets/schema.sql',
help='path to SQLite database schema for translations')
parser.add_argument('--sqlite_chn_table',
default='channels',
help='DB table where channels are stored (default="channels")')
parser.add_argument('--sqlite_chn_field',
default='channel_name',
help='field on channels table that contains name of the channel (default="channel_name")')
parser.add_argument('--sqlite_msg_table',
default='messages',
help='DB table where messages are stored (default="messages")')
parser.add_argument('--sqlite_msg_field',
default='message_text',
help='field on messages table that contains message text (default="message_text")')
args = parser.parse_args()

if args.verbose:
console_handler.setLevel(logging.INFO)
if args.debug:
console_handler.setLevel(logging.DEBUG)

# Read YAML Configuration file
config = load_and_parse_config(args.yaml_config)

# Set the API key
OPENAI_KEY = set_key(args.env)
client = OpenAI(api_key=OPENAI_KEY)
openai_key = set_key(args.env)
client = OpenAI(api_key=openai_key)

print("Input your message to translate:")
input_lang_ru=input().strip()
# Match the mode to run on
match args.mode:
case "manual":
logger.info("hermeneisGPT on manual mode")

translate_messages = [{"role":"system", "content": config['system']},
{"role":"user", "content": config['user']+input_lang_ru}]
# If a DB is provided, make sure the user knows it will be ignored
if args.sqlite_db:
logger.info("Running on manual mode, ignoring the DB file '%s'", args.sqlite_db)

# Initialize the OpenAI LLM (Language Learning Model)
llm_response = client.chat.completions.create(
model = config['model'],
messages = translate_messages,
max_tokens = config['max_tokens'],
temperature = config['temperature'],
)
print(llm_response.choices[0].message.content)
# Run interactive manual mode
translate_mode_manual(client, config)

case "auto-sqlite":
logger.info("hermeneisGPT on automatic SQLite mode")

# Automatic DB mode requires the database arg to be passed
if not args.sqlite_db:
logger.error("--sqlite_db is required when running on automatic SQLite mode")
return
# Automatic DB mode requires the hacktivist channel_name to translate messages from
if not args.channel_name:
logger.error("--channel_name is required when running on automatic SQLite mode")
return

# Run automatic mode with sqlite db
translate_mode_automatic(client, config, args)

except Exception as err:
logger.info("Exception in main()")
Expand Down
Empty file added lib/__init__.py
Empty file.
Loading

0 comments on commit 1774374

Please sign in to comment.