{ "cells": [ { "cell_type": "markdown", "id": "c9f89a47", "metadata": {}, "source": [ "## SilkMoth Demo" ] }, { "cell_type": "markdown", "id": "2ca15800", "metadata": {}, "source": [ "### Related Set Discovery task under Set‑Containment using Jaccard Similarity" ] }, { "cell_type": "markdown", "id": "ea6ce5fb", "metadata": {}, "source": [ "Import of all required modules:" ] }, { "cell_type": "code", "execution_count": 24, "id": "bdd1b92c", "metadata": {}, "outputs": [], "source": [ "import sys\n", "sys.path.append(\"src\")\n", "\n", "from silkmoth.tokenizer import Tokenizer\n", "from silkmoth.inverted_index import InvertedIndex\n", "from silkmoth.signature_generator import SignatureGenerator\n", "from silkmoth.candidate_selector import CandidateSelector\n", "from silkmoth.verifier import Verifier\n", "from silkmoth.silkmoth_engine import SilkMothEngine\n", "\n", "\n", "from silkmoth.utils import jaccard_similarity, contain, edit_similarity, similar, SigType\n", "\n", "import matplotlib.pyplot as plt\n", "from IPython.display import display, Markdown\n", "\n", "import numpy as np\n", "import pandas as pd" ] }, { "cell_type": "markdown", "id": "bf6bf1f5", "metadata": {}, "source": [ "Define example related dataset from \"SilkMoth\" paper (reference set **R** and source sets **S**)\n" ] }, { "cell_type": "code", "execution_count": 25, "id": "598a4bbf", "metadata": {}, "outputs": [ { "data": { "text/markdown": [ "**Reference set (R):**" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "- R[0]: “77 Mass Ave Boston MA”" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "- R[1]: “5th St 02115 Seattle WA”" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "- R[2]: “77 5th St Chicago IL”" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "**Source sets (S):**" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "- S[0]: “Mass Ave St Boston 02115 | 77 Mass 5th St Boston | 77 Mass Ave 5th 02115”" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "- S[1]: “77 Boston MA | 77 5th St Boston 02115 | 77 Mass Ave 02115 Seattle”" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "- S[2]: “77 Mass Ave 5th Boston MA | Mass Ave Chicago IL | 77 Mass Ave St”" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "- S[3]: “77 Mass Ave MA | 5th St 02115 Seattle WA | 77 5th St Boston Seattle”" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Location Dataset\n", "reference_set = [\n", " '77 Mass Ave Boston MA',\n", " '5th St 02115 Seattle WA',\n", " '77 5th St Chicago IL'\n", "]\n", "\n", "# Address Dataset\n", "source_sets = [\n", " ['Mass Ave St Boston 02115','77 Mass 5th St Boston','77 Mass Ave 5th 02115'],\n", " ['77 Boston MA','77 5th St Boston 02115','77 Mass Ave 02115 Seattle'],\n", " ['77 Mass Ave 5th Boston MA','Mass Ave Chicago IL','77 Mass Ave St'],\n", " ['77 Mass Ave MA','5th St 02115 Seattle WA','77 5th St Boston Seattle']\n", "]\n", "\n", "# thresholds & q\n", "δ = 0.7\n", "α = 0.0\n", "q = 3\n", "\n", "display(Markdown(\"**Reference set (R):**\"))\n", "for i, r in enumerate(reference_set):\n", " display(Markdown(f\"- R[{i}]: “{r}”\"))\n", "display(Markdown(\"**Source sets (S):**\"))\n", "for j, S in enumerate(source_sets):\n", " display(Markdown(f\"- S[{j}]: “{' | '.join(S)}”\"))" ] }, { "cell_type": "markdown", "id": "a50b350a", "metadata": {}, "source": [ "### 1. Tokenization\n", "Tokenize each element of R and each S using Jaccard Similarity (whitespace tokens)\n" ] }, { "cell_type": "code", "execution_count": 26, "id": "55e7b5d0", "metadata": {}, "outputs": [ { "data": { "text/markdown": [ "**Tokenized Reference set (R):**" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "- Tokens of R[0]: {'Ave', 'MA', '77', 'Boston', 'Mass'}" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "- Tokens of R[1]: {'5th', 'Seattle', 'St', 'WA', '02115'}" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "- Tokens of R[2]: {'77', '5th', 'IL', 'St', 'Chicago'}" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "**Tokenized Source sets (S):**" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "- Tokens of S[0]: [{'Ave', 'Boston', 'St', 'Mass', '02115'}, {'77', 'Boston', '5th', 'St', 'Mass'}, {'Ave', '77', '5th', 'Mass', '02115'}]" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "- Tokens of S[1]: [{'Boston', 'MA', '77'}, {'77', 'Boston', '5th', 'St', '02115'}, {'Ave', '77', 'Seattle', 'Mass', '02115'}]" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "- Tokens of S[2]: [{'Ave', 'MA', '77', 'Boston', '5th', 'Mass'}, {'IL', 'Ave', 'Mass', 'Chicago'}, {'St', 'Ave', 'Mass', '77'}]" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "- Tokens of S[3]: [{'Ave', 'Mass', '77', 'MA'}, {'5th', 'Seattle', 'St', 'WA', '02115'}, {'77', 'Boston', '5th', 'Seattle', 'St'}]" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "tokenizer = Tokenizer(jaccard_similarity, q)\n", "tokenized_R = tokenizer.tokenize(reference_set)\n", "tokenized_S = [tokenizer.tokenize(S) for S in source_sets]\n", "\n", "display(Markdown(\"**Tokenized Reference set (R):**\"))\n", "for i, toks in enumerate(tokenized_R):\n", " display(Markdown(f\"- Tokens of R[{i}]: {toks}\"))\n", "\n", "display(Markdown(\"**Tokenized Source sets (S):**\"))\n", "for i, toks in enumerate(tokenized_S):\n", " display(Markdown(f\"- Tokens of S[{i}]: {toks}\"))" ] }, { "cell_type": "markdown", "id": "e17b807b", "metadata": {}, "source": [ "### 2. Build Inverted Index\n", "Builds an inverted index on the tokenized source sets and shows an example lookup." ] }, { "cell_type": "code", "execution_count": 27, "id": "22c7d1d6", "metadata": {}, "outputs": [ { "data": { "text/markdown": [ "- Index built over 4 source sets." ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "- Example: token “Mass” appears in [(0, 0), (0, 1), (0, 2), (1, 2), (2, 0), (2, 1), (2, 2), (3, 0)]" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "index = InvertedIndex(tokenized_S)\n", "display(Markdown(f\"- Index built over {len(source_sets)} source sets.\"))\n", "display(Markdown(f\"- Example: token “Mass” appears in {index.get_indexes('Mass')}\"))\n" ] }, { "cell_type": "markdown", "id": "cc17daac", "metadata": {}, "source": [ "### 3. Signature Generation" ] }, { "cell_type": "markdown", "id": "1c48bac2", "metadata": {}, "source": [ "Generates the weighted signature for R given δ, α (here α=0), using Jaccard Similarity." ] }, { "cell_type": "code", "execution_count": 28, "id": "a36be65c", "metadata": {}, "outputs": [ { "data": { "text/markdown": [ "- Selected signature tokens: **['Chicago', 'WA', 'IL', '5th']**" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "sig_gen = SignatureGenerator()\n", "signature = sig_gen.get_signature(\n", " tokenized_R, index,\n", " delta=δ, alpha=α,\n", " sig_type=SigType.WEIGHTED,\n", " sim_fun=jaccard_similarity,\n", " q=q\n", ")\n", "display(Markdown(f\"- Selected signature tokens: **{signature}**\"))" ] }, { "cell_type": "markdown", "id": "938be3e2", "metadata": {}, "source": [ "### 4. Initial Candidate Selection\n", "\n", "Looks up each signature token in the inverted index to form the candidate set.\n" ] }, { "cell_type": "code", "execution_count": 29, "id": "58017e27", "metadata": {}, "outputs": [ { "data": { "text/markdown": [ "- Candidate set indices: **[0, 1, 2, 3]**" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ " - S[0]: “Mass Ave St Boston 02115 | 77 Mass 5th St Boston | 77 Mass Ave 5th 02115”" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ " - S[1]: “77 Boston MA | 77 5th St Boston 02115 | 77 Mass Ave 02115 Seattle”" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ " - S[2]: “77 Mass Ave 5th Boston MA | Mass Ave Chicago IL | 77 Mass Ave St”" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ " - S[3]: “77 Mass Ave MA | 5th St 02115 Seattle WA | 77 5th St Boston Seattle”" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "cand_sel = CandidateSelector(\n", " similarity_func=jaccard_similarity,\n", " sim_metric=contain,\n", " related_thresh=δ,\n", " sim_thresh=α,\n", " q=q\n", ")\n", "\n", "initial_cands = cand_sel.get_candidates(signature, index, len(tokenized_R))\n", "display(Markdown(f\"- Candidate set indices: **{sorted(initial_cands)}**\"))\n", "for j in sorted(initial_cands):\n", " display(Markdown(f\" - S[{j}]: “{' | '.join(source_sets[j])}”\"))" ] }, { "cell_type": "markdown", "id": "d633e5f9", "metadata": {}, "source": [ "### 5. Check Filter\n", "Prunes candidates by ensuring each matched element passes the local similarity bound.\n" ] }, { "cell_type": "code", "execution_count": 30, "id": "9a2bfdeb", "metadata": {}, "outputs": [ { "data": { "text/markdown": [ "**Surviving after check filter:** **[0, 1, 3]**" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "S[0] matched:" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ " • R[2] “77 5th St Chicago IL” → sim = 0.429" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ " → Best sim: **0.429** | Matched elements: **1**" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "S[1] matched:" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ " • R[2] “77 5th St Chicago IL” → sim = 0.429" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ " → Best sim: **0.429** | Matched elements: **1**" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "S[3] matched:" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ " • R[1] “5th St 02115 Seattle WA” → sim = 1.000" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ " • R[2] “77 5th St Chicago IL” → sim = 0.429" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ " → Best sim: **1.000** | Matched elements: **2**" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "filtered_cands, match_map = cand_sel.check_filter(\n", " tokenized_R, set(signature), initial_cands, index\n", ")\n", "display(Markdown(f\"**Surviving after check filter:** **{sorted(filtered_cands)}**\"))\n", "for j in sorted(filtered_cands):\n", " display(Markdown(f\"S[{j}] matched:\"))\n", " for r_idx, sim in match_map[j].items():\n", " sim_text = f\"{sim:.3f}\"\n", " display(Markdown(f\" • R[{r_idx}] “{reference_set[r_idx]}” → sim = {sim_text}\"))\n", " \n", " matches = match_map.get(j, {})\n", " if matches:\n", " best_sim = max(matches.values())\n", " num_matches = len(matches)\n", " display(Markdown(f\" → Best sim: **{best_sim:.3f}** | Matched elements: **{num_matches}**\"))\n", " else:\n", " display(Markdown(f\"No elements passed similarity checks.\"))\n" ] }, { "cell_type": "markdown", "id": "cc37bb7f", "metadata": {}, "source": [ "### 6. Nearest‑Neighbor Filter\n", "\n", "Further prunes via nearest‑neighbor upper bounds on total matching score.\n" ] }, { "cell_type": "code", "execution_count": 31, "id": "aa9b7a63", "metadata": {}, "outputs": [ { "data": { "text/markdown": [ "- Surviving after NN filter: **[3]**" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ " - S[3]: “77 Mass Ave MA | 5th St 02115 Seattle WA | 77 5th St Boston Seattle”" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "nn_filtered = cand_sel.nn_filter(\n", " tokenized_R, set(signature), filtered_cands,\n", " index, threshold=δ, match_map=match_map\n", ")\n", "display(Markdown(f\"- Surviving after NN filter: **{sorted(nn_filtered)}**\"))\n", "for j in nn_filtered:\n", " display(Markdown(f\" - S[{j}]: “{' | '.join(source_sets[j])}”\"))\n" ] }, { "cell_type": "markdown", "id": "8638f83a", "metadata": {}, "source": [ "### 7. Verification\n", "\n", "Runs the bipartite max‑matching on the remaining candidates and outputs the final related sets.\n" ] }, { "cell_type": "code", "execution_count": 32, "id": "ebdf20fe", "metadata": {}, "outputs": [ { "data": { "text/markdown": [ "Final related sets (score ≥ 0.7):" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ " • S[3]: “77 Mass Ave MA | 5th St 02115 Seattle WA | 77 5th St Boston Seattle” → **0.743**" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "verifier = Verifier(δ, contain, jaccard_similarity, sim_thresh=α, reduction=False)\n", "results = verifier.get_related_sets(tokenized_R, nn_filtered, index)\n", "\n", "if results:\n", " display(Markdown(f\"Final related sets (score ≥ {δ}):\"))\n", " for j, score in results:\n", " display(Markdown(f\" • S[{j}]: “{' | '.join(source_sets[j])}” → **{score:.3f}**\"))\n", "else:\n", " display(Markdown(\"- No sets passed verification.\"))\n" ] } ], "metadata": { "kernelspec": { "display_name": "silkmoth_env", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.13" } }, "nbformat": 4, "nbformat_minor": 5 }