This commit is contained in:
Andreas Wilms
2025-09-08 19:05:42 +02:00
commit d85c1c86df
153 changed files with 140246 additions and 0 deletions

823
demo_example.ipynb Normal file
View File

@@ -0,0 +1,823 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "c9f89a47",
"metadata": {},
"source": [
"## SilkMoth Demo"
]
},
{
"cell_type": "markdown",
"id": "2ca15800",
"metadata": {},
"source": [
"### Related Set Discovery task under SetContainment using Jaccard Similarity"
]
},
{
"cell_type": "markdown",
"id": "ea6ce5fb",
"metadata": {},
"source": [
"Import of all required modules:"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "bdd1b92c",
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"sys.path.append(\"src\")\n",
"\n",
"from silkmoth.tokenizer import Tokenizer\n",
"from silkmoth.inverted_index import InvertedIndex\n",
"from silkmoth.signature_generator import SignatureGenerator\n",
"from silkmoth.candidate_selector import CandidateSelector\n",
"from silkmoth.verifier import Verifier\n",
"from silkmoth.silkmoth_engine import SilkMothEngine\n",
"\n",
"\n",
"from silkmoth.utils import jaccard_similarity, contain, edit_similarity, similar, SigType\n",
"\n",
"import matplotlib.pyplot as plt\n",
"from IPython.display import display, Markdown\n",
"\n",
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"id": "bf6bf1f5",
"metadata": {},
"source": [
"Define example related dataset from \"SilkMoth\" paper (reference set **R** and source sets **S**)\n"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "598a4bbf",
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"**Reference set (R):**"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
"- R[0]: “77 Mass Ave Boston MA”"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
"- R[1]: “5th St 02115 Seattle WA”"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
"- R[2]: “77 5th St Chicago IL”"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
"**Source sets (S):**"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
"- S[0]: “Mass Ave St Boston 02115 | 77 Mass 5th St Boston | 77 Mass Ave 5th 02115”"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
"- S[1]: “77 Boston MA | 77 5th St Boston 02115 | 77 Mass Ave 02115 Seattle”"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
"- S[2]: “77 Mass Ave 5th Boston MA | Mass Ave Chicago IL | 77 Mass Ave St”"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
"- S[3]: “77 Mass Ave MA | 5th St 02115 Seattle WA | 77 5th St Boston Seattle”"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Location Dataset\n",
"reference_set = [\n",
" '77 Mass Ave Boston MA',\n",
" '5th St 02115 Seattle WA',\n",
" '77 5th St Chicago IL'\n",
"]\n",
"\n",
"# Address Dataset\n",
"source_sets = [\n",
" ['Mass Ave St Boston 02115','77 Mass 5th St Boston','77 Mass Ave 5th 02115'],\n",
" ['77 Boston MA','77 5th St Boston 02115','77 Mass Ave 02115 Seattle'],\n",
" ['77 Mass Ave 5th Boston MA','Mass Ave Chicago IL','77 Mass Ave St'],\n",
" ['77 Mass Ave MA','5th St 02115 Seattle WA','77 5th St Boston Seattle']\n",
"]\n",
"\n",
"# thresholds & q\n",
"δ = 0.7\n",
"α = 0.0\n",
"q = 3\n",
"\n",
"display(Markdown(\"**Reference set (R):**\"))\n",
"for i, r in enumerate(reference_set):\n",
" display(Markdown(f\"- R[{i}]: “{r}”\"))\n",
"display(Markdown(\"**Source sets (S):**\"))\n",
"for j, S in enumerate(source_sets):\n",
" display(Markdown(f\"- S[{j}]: “{' | '.join(S)}”\"))"
]
},
{
"cell_type": "markdown",
"id": "a50b350a",
"metadata": {},
"source": [
"### 1. Tokenization\n",
"Tokenize each element of R and each S using Jaccard Similarity (whitespace tokens)\n"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "55e7b5d0",
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"**Tokenized Reference set (R):**"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
"- Tokens of R[0]: {'Ave', 'MA', '77', 'Boston', 'Mass'}"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
"- Tokens of R[1]: {'5th', 'Seattle', 'St', 'WA', '02115'}"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
"- Tokens of R[2]: {'77', '5th', 'IL', 'St', 'Chicago'}"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
"**Tokenized Source sets (S):**"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
"- Tokens of S[0]: [{'Ave', 'Boston', 'St', 'Mass', '02115'}, {'77', 'Boston', '5th', 'St', 'Mass'}, {'Ave', '77', '5th', 'Mass', '02115'}]"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
"- Tokens of S[1]: [{'Boston', 'MA', '77'}, {'77', 'Boston', '5th', 'St', '02115'}, {'Ave', '77', 'Seattle', 'Mass', '02115'}]"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
"- Tokens of S[2]: [{'Ave', 'MA', '77', 'Boston', '5th', 'Mass'}, {'IL', 'Ave', 'Mass', 'Chicago'}, {'St', 'Ave', 'Mass', '77'}]"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
"- Tokens of S[3]: [{'Ave', 'Mass', '77', 'MA'}, {'5th', 'Seattle', 'St', 'WA', '02115'}, {'77', 'Boston', '5th', 'Seattle', 'St'}]"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"tokenizer = Tokenizer(jaccard_similarity, q)\n",
"tokenized_R = tokenizer.tokenize(reference_set)\n",
"tokenized_S = [tokenizer.tokenize(S) for S in source_sets]\n",
"\n",
"display(Markdown(\"**Tokenized Reference set (R):**\"))\n",
"for i, toks in enumerate(tokenized_R):\n",
" display(Markdown(f\"- Tokens of R[{i}]: {toks}\"))\n",
"\n",
"display(Markdown(\"**Tokenized Source sets (S):**\"))\n",
"for i, toks in enumerate(tokenized_S):\n",
" display(Markdown(f\"- Tokens of S[{i}]: {toks}\"))"
]
},
{
"cell_type": "markdown",
"id": "e17b807b",
"metadata": {},
"source": [
"### 2. Build Inverted Index\n",
"Builds an inverted index on the tokenized source sets and shows an example lookup."
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "22c7d1d6",
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"- Index built over 4 source sets."
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
"- Example: token “Mass” appears in [(0, 0), (0, 1), (0, 2), (1, 2), (2, 0), (2, 1), (2, 2), (3, 0)]"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"index = InvertedIndex(tokenized_S)\n",
"display(Markdown(f\"- Index built over {len(source_sets)} source sets.\"))\n",
"display(Markdown(f\"- Example: token “Mass” appears in {index.get_indexes('Mass')}\"))\n"
]
},
{
"cell_type": "markdown",
"id": "cc17daac",
"metadata": {},
"source": [
"### 3. Signature Generation"
]
},
{
"cell_type": "markdown",
"id": "1c48bac2",
"metadata": {},
"source": [
"Generates the weighted signature for R given δ, α (here α=0), using Jaccard Similarity."
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "a36be65c",
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"- Selected signature tokens: **['Chicago', 'WA', 'IL', '5th']**"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"sig_gen = SignatureGenerator()\n",
"signature = sig_gen.get_signature(\n",
" tokenized_R, index,\n",
" delta=δ, alpha=α,\n",
" sig_type=SigType.WEIGHTED,\n",
" sim_fun=jaccard_similarity,\n",
" q=q\n",
")\n",
"display(Markdown(f\"- Selected signature tokens: **{signature}**\"))"
]
},
{
"cell_type": "markdown",
"id": "938be3e2",
"metadata": {},
"source": [
"### 4. Initial Candidate Selection\n",
"\n",
"Looks up each signature token in the inverted index to form the candidate set.\n"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "58017e27",
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"- Candidate set indices: **[0, 1, 2, 3]**"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
" - S[0]: “Mass Ave St Boston 02115 | 77 Mass 5th St Boston | 77 Mass Ave 5th 02115”"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
" - S[1]: “77 Boston MA | 77 5th St Boston 02115 | 77 Mass Ave 02115 Seattle”"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
" - S[2]: “77 Mass Ave 5th Boston MA | Mass Ave Chicago IL | 77 Mass Ave St”"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
" - S[3]: “77 Mass Ave MA | 5th St 02115 Seattle WA | 77 5th St Boston Seattle”"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"cand_sel = CandidateSelector(\n",
" similarity_func=jaccard_similarity,\n",
" sim_metric=contain,\n",
" related_thresh=δ,\n",
" sim_thresh=α,\n",
" q=q\n",
")\n",
"\n",
"initial_cands = cand_sel.get_candidates(signature, index, len(tokenized_R))\n",
"display(Markdown(f\"- Candidate set indices: **{sorted(initial_cands)}**\"))\n",
"for j in sorted(initial_cands):\n",
" display(Markdown(f\" - S[{j}]: “{' | '.join(source_sets[j])}”\"))"
]
},
{
"cell_type": "markdown",
"id": "d633e5f9",
"metadata": {},
"source": [
"### 5. Check Filter\n",
"Prunes candidates by ensuring each matched element passes the local similarity bound.\n"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "9a2bfdeb",
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"**Surviving after check filter:** **[0, 1, 3]**"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
"S[0] matched:"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
" • R[2] “77 5th St Chicago IL” → sim = 0.429"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
" → Best sim: **0.429** | Matched elements: **1**"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
"S[1] matched:"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
" • R[2] “77 5th St Chicago IL” → sim = 0.429"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
" → Best sim: **0.429** | Matched elements: **1**"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
"S[3] matched:"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
" • R[1] “5th St 02115 Seattle WA” → sim = 1.000"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
" • R[2] “77 5th St Chicago IL” → sim = 0.429"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
" → Best sim: **1.000** | Matched elements: **2**"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"filtered_cands, match_map = cand_sel.check_filter(\n",
" tokenized_R, set(signature), initial_cands, index\n",
")\n",
"display(Markdown(f\"**Surviving after check filter:** **{sorted(filtered_cands)}**\"))\n",
"for j in sorted(filtered_cands):\n",
" display(Markdown(f\"S[{j}] matched:\"))\n",
" for r_idx, sim in match_map[j].items():\n",
" sim_text = f\"{sim:.3f}\"\n",
" display(Markdown(f\" • R[{r_idx}] “{reference_set[r_idx]}” → sim = {sim_text}\"))\n",
" \n",
" matches = match_map.get(j, {})\n",
" if matches:\n",
" best_sim = max(matches.values())\n",
" num_matches = len(matches)\n",
" display(Markdown(f\" → Best sim: **{best_sim:.3f}** | Matched elements: **{num_matches}**\"))\n",
" else:\n",
" display(Markdown(f\"No elements passed similarity checks.\"))\n"
]
},
{
"cell_type": "markdown",
"id": "cc37bb7f",
"metadata": {},
"source": [
"### 6. NearestNeighbor Filter\n",
"\n",
"Further prunes via nearestneighbor upper bounds on total matching score.\n"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "aa9b7a63",
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"- Surviving after NN filter: **[3]**"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
" - S[3]: “77 Mass Ave MA | 5th St 02115 Seattle WA | 77 5th St Boston Seattle”"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"nn_filtered = cand_sel.nn_filter(\n",
" tokenized_R, set(signature), filtered_cands,\n",
" index, threshold=δ, match_map=match_map\n",
")\n",
"display(Markdown(f\"- Surviving after NN filter: **{sorted(nn_filtered)}**\"))\n",
"for j in nn_filtered:\n",
" display(Markdown(f\" - S[{j}]: “{' | '.join(source_sets[j])}”\"))\n"
]
},
{
"cell_type": "markdown",
"id": "8638f83a",
"metadata": {},
"source": [
"### 7. Verification\n",
"\n",
"Runs the bipartite maxmatching on the remaining candidates and outputs the final related sets.\n"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "ebdf20fe",
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"Final related sets (score ≥ 0.7):"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
" • S[3]: “77 Mass Ave MA | 5th St 02115 Seattle WA | 77 5th St Boston Seattle” → **0.743**"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"verifier = Verifier(δ, contain, jaccard_similarity, sim_thresh=α, reduction=False)\n",
"results = verifier.get_related_sets(tokenized_R, nn_filtered, index)\n",
"\n",
"if results:\n",
" display(Markdown(f\"Final related sets (score ≥ {δ}):\"))\n",
" for j, score in results:\n",
" display(Markdown(f\" • S[{j}]: “{' | '.join(source_sets[j])}” → **{score:.3f}**\"))\n",
"else:\n",
" display(Markdown(\"- No sets passed verification.\"))\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "silkmoth_env",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}