Skip to content
Snippets Groups Projects
Verified Commit af8f9b46 authored by Frank Sauerburger's avatar Frank Sauerburger
Browse files

Add q-gram index search

parents
No related branches found
No related tags found
No related merge requests found
Pipeline #9632 passed
__pycache__/
.ipynb_checkpoints/
*.ipynb
# GitLab CI pipeline: a single "test" stage that runs the unit tests and pylint.
stages:
- test
################################################################################
# Unittest
# Job template (YAML anchor) that installs the requirements plus pytest and
# then runs the test suite; concrete jobs merge it in via "<<:".
.unittest: &unittest_template
stage: test
script:
- pip install -r requirements.txt
- pip install pytest~=6.2.5
- pytest
# Concrete unit-test job: the template above executed in the python:3.10 image.
unittest:py3.10:
<<: *unittest_template
image: python:3.10
################################################################################
# Lint
# Job template (YAML anchor) that installs the requirements plus pylint and
# lints the files matched by the shell glob "*py".
.pylint: &pylint_template
stage: test
script:
- pip install -r requirements.txt
- pip install pylint~=2.12.2
- pylint *py
# Concrete lint job: the template above executed in the python:3.10 image.
pylint:py3.10:
<<: *pylint_template
image: python:3.10
qgram.py 0 → 100644
# Copyright 2022, Frank Sauerburger
"""Implementation of a q-gram index"""
import collections
class QGramIndex:
    """Q-gram index for fuzzy lookup.

    Every term is padded with a dollar sign on both ends and split into
    overlapping substrings of length ``q_param`` (its q-grams).  The index
    maps each q-gram to the set of terms that contain it, so that a query
    can be compared with the vocabulary through the q-grams they share.
    """

    def __init__(self, q_param: int = 3):
        """Create an empty index that uses q-grams of length ``q_param``."""
        self.q_param = q_param
        # q-gram -> set of indexed terms containing that q-gram
        self._index = collections.defaultdict(set)

    def add_term(self, term: str) -> None:
        """Add a single term to the index."""
        for qgram in self._chunk(term):
            self._index[qgram].add(term)

    def search(self, query: str) -> list[tuple[float, str]]:
        """Find vocabulary terms matching the query.

        Every indexed term sharing at least one q-gram with ``query`` is
        scored with the Jaccard coefficient of the two q-gram sets (see
        *Introduction to Information Retrieval*).

        Returns a list of ``(jaccard, term)`` tuples sorted by ascending
        score, i.e. the best match is the last element.  The list is empty
        if no indexed term shares a q-gram with the query.  Searching never
        modifies the index.
        """
        # The Jaccard coefficient is defined on sets: drop repeated q-grams
        # (keeping first-seen order) so that a term cannot be counted more
        # than once for the same q-gram, which would push the score above 1.
        query_qgrams = list(dict.fromkeys(self._chunk(query)))
        n_query = len(query_qgrams)

        overlap = collections.Counter()
        for qgram in query_qgrams:
            # .get() avoids the defaultdict side effect of inserting an
            # empty set for every q-gram that is not in the index.
            overlap.update(self._index.get(qgram, ()))

        result = []
        for term, count in overlap.items():
            # Distinct q-grams of the term, consistent with the set-based
            # overlap count above.
            n_term = len(set(self._chunk(term)))
            jaccard = count / (n_term + n_query - count)
            result.append((jaccard, term))

        result.sort(key=lambda pair: pair[0])
        return result

    def _n_qgram(self, term: str) -> int:
        """Return the number of q-grams ``_chunk`` produces for ``term``.

        The count includes repeated q-grams and is derived from the term
        length alone; a term too short for a full q-gram yields one chunk.
        """
        # The padded term has len(term) + 2 characters, which gives
        # len(term) + 2 - q + 1 windows of length q.
        return max(1, len(term) + 3 - self.q_param)

    def _chunk(self, term: str) -> list[str]:
        """Split an input term into a list of q-grams.

        The beginning and end of the string is denoted by a dollar sign ($).
        If the term is too short, the single q-gram returned is the whole
        padded term and is therefore shorter than q.
        """
        padded = f"${term}$"
        if len(padded) < self.q_param:
            return [padded]

        return [
            padded[start:start + self.q_param]
            for start in range(len(padded) - self.q_param + 1)
        ]
"""Test cases for the q-gram index module"""
# pylint: disable=W0212
import unittest
from qgram import QGramIndex
class QGramChunkTest(unittest.TestCase):
    """Verify how _chunk() splits padded terms into q-grams"""

    def assert_chunks(self, index, term, expected):
        """Assert that chunking term with the given index yields expected"""
        self.assertEqual(index._chunk(term), expected)

    def test_long(self):
        """Terms longer than q produce a sliding window of q-grams"""
        self.assert_chunks(
            QGramIndex(),
            "weather",
            ["$we", "wea", "eat", "ath", "the", "her", "er$"],
        )
        self.assert_chunks(
            QGramIndex(q_param=3),
            "hello",
            ["$he", "hel", "ell", "llo", "lo$"],
        )

    def test_short(self):
        """Terms of up to q characters with the default q"""
        default_index = QGramIndex()
        self.assert_chunks(default_index, "eat", ["$ea", "eat", "at$"])
        self.assert_chunks(default_index, "to", ["$to", "to$"])
        self.assert_chunks(default_index, "a", ["$a$"])

    def test_empty(self):
        """The empty string yields only the two padding characters"""
        self.assert_chunks(QGramIndex(), "", ["$$"])

    def test_q(self):
        """Chunking with q=5 on terms that are long enough"""
        wide_index = QGramIndex(q_param=5)
        self.assert_chunks(
            wide_index,
            "weather",
            ["$weat", "weath", "eathe", "ather", "ther$"],
        )
        self.assert_chunks(wide_index, "play", ["$play", "play$"])

    def test_q_short(self):
        """Chunking with q=5 on terms shorter than a full q-gram"""
        wide_index = QGramIndex(q_param=5)
        self.assert_chunks(wide_index, "eat", ["$eat$"])
        self.assert_chunks(wide_index, "a", ["$a$"])

    def test_q_empty(self):
        """Chunking with q=5 on the empty string"""
        self.assert_chunks(QGramIndex(q_param=5), "", ["$$"])
class QGramNQGramsTest(unittest.TestCase):
    """Verify the q-gram counts reported by _n_qgram()"""

    def assert_counts(self, index, expected):
        """Assert _n_qgram() returns the count of every (term, count) pair"""
        for term, count in expected:
            self.assertEqual(index._n_qgram(term), count)

    def test_long(self):
        """Counts for terms longer than q"""
        self.assert_counts(QGramIndex(), [("weather", 7)])
        self.assert_counts(QGramIndex(q_param=3), [("hello", 5)])

    def test_short(self):
        """Counts for terms of up to q characters with the default q"""
        self.assert_counts(QGramIndex(), [("eat", 3), ("to", 2), ("a", 1)])

    def test_empty(self):
        """The empty string counts as a single q-gram"""
        self.assert_counts(QGramIndex(), [("", 1)])

    def test_q(self):
        """Counts with q=5 on terms that are long enough"""
        self.assert_counts(QGramIndex(q_param=5), [("weather", 5), ("play", 2)])

    def test_q_short(self):
        """Counts with q=5 on terms shorter than a full q-gram"""
        self.assert_counts(QGramIndex(q_param=5), [("eat", 1), ("a", 1)])

    def test_q_empty(self):
        """Count with q=5 on the empty string"""
        self.assert_counts(QGramIndex(q_param=5), [("", 1)])
class QGramAddTest(unittest.TestCase):
    """Verify that add_term() fills the internal q-gram mapping"""

    def test_add(self):
        """A single added term appears under each of its q-grams"""
        index = QGramIndex()
        index.add_term("hello")

        expected = {
            qgram: {"hello"}
            for qgram in ("$he", "hel", "ell", "llo", "lo$")
        }
        self.assertEqual(index._index, expected)

    def test_add_multiple(self):
        """Shared q-grams map to both terms, distinct q-grams to one term"""
        index = QGramIndex()
        for term in ("rope", "pope"):
            index.add_term(term)

        both = {"pope", "rope"}
        expectations = [
            ("ope", both),
            ("pe$", both),
            ("rop", {"rope"}),
            ("pop", {"pope"}),
            ("dop", set()),  # q-gram that belongs to neither term
        ]
        for qgram, terms in expectations:
            self.assertEqual(index._index[qgram], terms)
class QGramSearchTest(unittest.TestCase):
    """Verify the results returned by search()"""

    @staticmethod
    def build_index(terms):
        """Return a default QGramIndex populated with the given terms"""
        index = QGramIndex()
        for term in terms:
            index.add_term(term)
        return index

    def test_find(self):
        """search() returns the only term sharing a q-gram with the query"""
        index = self.build_index(["hello", "world", "beer", "moon"])

        result = index.search("bear")

        _, best_term = result[0]
        self.assertEqual(best_term, "beer")
        self.assertEqual(len(result), 1)

    def test_no_find(self):
        """search() returns nothing when no term shares a q-gram"""
        index = self.build_index(["hello", "world", "beer", "moon", "deer"])

        result = index.search("baseball")

        self.assertEqual(len(result), 0)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment