Skip to content

Commit e4a2285

Browse files
committed
Replaced spaCy with NLTK as the tokenizer for shortening prompts
1 parent 71e5eb8 commit e4a2285

2 files changed

Lines changed: 21 additions & 27 deletions

File tree

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,8 @@ dependencies = [
3535
'pypsexec == 0.3.0',
3636
'pydantic == 2.8.2',
3737
'openai == 1.28.0',
38-
'spacy',
39-
'BeautifulSoup4'
38+
'BeautifulSoup4',
39+
'nltk'
4040
]
4141

4242
[project.urls]

src/hackingBuddyGPT/usecases/web_api_testing/prompt_engineer.py

Lines changed: 19 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1-
import spacy
1+
import nltk
2+
from nltk.tokenize import word_tokenize
3+
from nltk.corpus import stopwords
24
from instructor.retry import InstructorRetryException
35

46

@@ -37,16 +39,9 @@ def __init__(self, strategy, llm_handler, history, schemas, response_handler):
3739
self.endpoint_found_methods = {}
3840
model_name = "en_core_web_sm"
3941

40-
# Check if the model is already installed
41-
from spacy.util import is_package
42-
if not is_package(model_name):
43-
print(f"Model '{model_name}' is not installed. Installing now...")
44-
spacy.cli.download(model_name)
45-
46-
# Load the model
47-
self.nlp = spacy.load(model_name)
48-
49-
self.nlp = spacy.load("en_core_web_sm")
42+
# Check if the models are already installed
43+
nltk.download('punkt')
44+
nltk.download('stopwords')
5045
self._prompt_history = history
5146
self.prompt = self._prompt_history
5247
self.previous_prompt = self._prompt_history[self.round]["content"]
@@ -199,20 +194,19 @@ def chain_of_thought(self, doc=False, hint=""):
199194

200195
def token_count(self, text):
201196
"""
202-
Counts the number of word tokens in the provided text using spaCy's tokenizer.
203-
204-
Args:
205-
text (str): The input text to tokenize and count.
206-
207-
Returns:
208-
int: The number of tokens in the input text.
209-
"""
210-
# Process the text through spaCy's pipeline
211-
doc = self.nlp(text)
212-
# Count tokens that aren't punctuation marks
213-
tokens = [token for token in doc if not token.is_punct]
214-
print(f'TOKENS: {len(tokens)}')
215-
return len(tokens)
197+
Counts the number of word tokens in the provided text using NLTK's tokenizer.
198+
199+
Args:
200+
text (str): The input text to tokenize and count.
201+
202+
Returns:
203+
int: The number of tokens in the input text.
204+
"""
205+
# Tokenize the text using NLTK
206+
tokens = word_tokenize(text)
207+
# Filter out punctuation marks
208+
words = [token for token in tokens if token.isalnum()]
209+
return len(words)
216210

217211

218212
def check_prompt(self, previous_prompt, chain_of_thought_steps, max_tokens=900):

0 commit comments

Comments (0)