Skip to content

Commit e4a2285

Browse files
committed
Replaced spaCy with NLTK as the tokenizer for shortening prompts
1 parent 71e5eb8 commit e4a2285

2 files changed

Lines changed: 21 additions & 27 deletions

File tree

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,8 @@ dependencies = [
3535
'pypsexec == 0.3.0',
3636
'pydantic == 2.8.2',
3737
'openai == 1.28.0',
38-
'spacy',
39-
'BeautifulSoup4'
38+
'BeautifulSoup4',
39+
'nltk'
4040
]
4141

4242
[project.urls]

src/hackingBuddyGPT/usecases/web_api_testing/prompt_engineer.py

Lines changed: 19 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1-
import spacy
1+
import nltk
2+
from nltk.tokenize import word_tokenize
3+
from nltk.corpus import stopwords
24
from instructor.retry import InstructorRetryException
35

46

@@ -37,16 +39,9 @@ def __init__(self, strategy, llm_handler, history, schemas, response_handler):
3739
self.endpoint_found_methods = {}
3840
model_name = "en_core_web_sm"
3941

40-
# Check if the model is already installed
41-
from spacy.util import is_package
42-
if not is_package(model_name):
43-
print(f"Model '{model_name}' is not installed. Installing now...")
44-
spacy.cli.download(model_name)
45-
46-
# Load the model
47-
self.nlp = spacy.load(model_name)
48-
49-
self.nlp = spacy.load("en_core_web_sm")
42+
# Check if the models are already installed
43+
nltk.download('punkt')
44+
nltk.download('stopwords')
5045
self._prompt_history = history
5146
self.prompt = self._prompt_history
5247
self.previous_prompt = self._prompt_history[self.round]["content"]
@@ -199,20 +194,19 @@ def chain_of_thought(self, doc=False, hint=""):
199194

200195
def token_count(self, text):
201196
"""
202-
Counts the number of word tokens in the provided text using spaCy's tokenizer.
203-
204-
Args:
205-
text (str): The input text to tokenize and count.
206-
207-
Returns:
208-
int: The number of tokens in the input text.
209-
"""
210-
# Process the text through spaCy's pipeline
211-
doc = self.nlp(text)
212-
# Count tokens that aren't punctuation marks
213-
tokens = [token for token in doc if not token.is_punct]
214-
print(f'TOKENS: {len(tokens)}')
215-
return len(tokens)
197+
Counts the number of word tokens in the provided text using NLTK's tokenizer.
198+
199+
Args:
200+
text (str): The input text to tokenize and count.
201+
202+
Returns:
203+
int: The number of tokens in the input text.
204+
"""
205+
# Tokenize the text using NLTK
206+
tokens = word_tokenize(text)
207+
# Filter out punctuation marks
208+
words = [token for token in tokens if token.isalnum()]
209+
return len(words)
216210

217211

218212
def check_prompt(self, previous_prompt, chain_of_thought_steps, max_tokens=900):

0 commit comments

Comments (0)