SilkMoth/docu/demo_example.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "c9f89a47",
   "metadata": {},
   "source": [
    "## SilkMoth Demo"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2ca15800",
   "metadata": {},
   "source": [
    "### Related Set Discovery task under Set‑Containment using Jaccard Similarity"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ea6ce5fb",
   "metadata": {},
   "source": [
    "Import of all required modules:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "bdd1b92c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.append(\"src\")\n",
    "\n",
    "from silkmoth.tokenizer import Tokenizer\n",
    "from silkmoth.inverted_index import InvertedIndex\n",
    "from silkmoth.signature_generator import SignatureGenerator\n",
    "from silkmoth.candidate_selector import CandidateSelector\n",
    "from silkmoth.verifier import Verifier\n",
    "from silkmoth.silkmoth_engine import SilkMothEngine\n",
    "\n",
    "\n",
    "from silkmoth.utils import jaccard_similarity, contain, edit_similarity, similar, SigType\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "from IPython.display import display, Markdown\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bf6bf1f5",
   "metadata": {},
   "source": [
    "Define the example dataset from the SilkMoth paper: a reference set **R** and a collection of source sets **S**.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "598a4bbf",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "**Reference set (R):**"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/markdown": [
       "- R[0]: “77 Mass Ave Boston MA”"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/markdown": [
       "- R[1]: “5th St 02115 Seattle WA”"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/markdown": [
       "- R[2]: “77 5th St Chicago IL”"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/markdown": [
       "**Source sets (S):**"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/markdown": [
       "- S[0]: “Mass Ave St Boston 02115 | 77 Mass 5th St Boston | 77 Mass Ave 5th 02115”"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/markdown": [
       "- S[1]: “77 Boston MA | 77 5th St Boston 02115 | 77 Mass Ave 02115 Seattle”"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/markdown": [
       "- S[2]: “77 Mass Ave 5th Boston MA | Mass Ave Chicago IL | 77 Mass Ave St”"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/markdown": [
       "- S[3]: “77 Mass Ave MA | 5th St 02115 Seattle WA | 77 5th St Boston Seattle”"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Location Dataset\n",
    "reference_set = [\n",
    "    '77 Mass Ave Boston MA',\n",
    "    '5th St 02115 Seattle WA',\n",
    "    '77 5th St Chicago IL'\n",
    "]\n",
    "\n",
    "# Address Dataset\n",
    "source_sets = [\n",
    "    ['Mass Ave St Boston 02115','77 Mass 5th St Boston','77 Mass Ave 5th 02115'],\n",
    "    ['77 Boston MA','77 5th St Boston 02115','77 Mass Ave 02115 Seattle'],\n",
    "    ['77 Mass Ave 5th Boston MA','Mass Ave Chicago IL','77 Mass Ave St'],\n",
    "    ['77 Mass Ave MA','5th St 02115 Seattle WA','77 5th St Boston Seattle']\n",
    "]\n",
    "\n",
    "# thresholds & q\n",
    "δ = 0.7\n",
    "α = 0.0\n",
    "q = 3\n",
    "\n",
    "display(Markdown(\"**Reference set (R):**\"))\n",
    "for i, r in enumerate(reference_set):\n",
    "    display(Markdown(f\"- R[{i}]: “{r}”\"))\n",
    "display(Markdown(\"**Source sets (S):**\"))\n",
    "for j, S in enumerate(source_sets):\n",
    "    display(Markdown(f\"- S[{j}]: “{' | '.join(S)}”\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a50b350a",
   "metadata": {},
   "source": [
    "### 1. Tokenization\n",
    "Tokenize each element of R and of every source set S into whitespace-separated tokens, the token form used here for Jaccard similarity.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "55e7b5d0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "**Tokenized Reference set (R):**"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/markdown": [
       "- Tokens of R[0]: {'Ave', 'MA', '77', 'Boston', 'Mass'}"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/markdown": [
       "- Tokens of R[1]: {'5th', 'Seattle', 'St', 'WA', '02115'}"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/markdown": [
       "- Tokens of R[2]: {'77', '5th', 'IL', 'St', 'Chicago'}"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/markdown": [
       "**Tokenized Source sets (S):**"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/markdown": [
       "- Tokens of S[0]: [{'Ave', 'Boston', 'St', 'Mass', '02115'}, {'77', 'Boston', '5th', 'St', 'Mass'}, {'Ave', '77', '5th', 'Mass', '02115'}]"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/markdown": [
       "- Tokens of S[1]: [{'Boston', 'MA', '77'}, {'77', 'Boston', '5th', 'St', '02115'}, {'Ave', '77', 'Seattle', 'Mass', '02115'}]"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/markdown": [
       "- Tokens of S[2]: [{'Ave', 'MA', '77', 'Boston', '5th', 'Mass'}, {'IL', 'Ave', 'Mass', 'Chicago'}, {'St', 'Ave', 'Mass', '77'}]"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/markdown": [
       "- Tokens of S[3]: [{'Ave', 'Mass', '77', 'MA'}, {'5th', 'Seattle', 'St', 'WA', '02115'}, {'77', 'Boston', '5th', 'Seattle', 'St'}]"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "tokenizer = Tokenizer(jaccard_similarity, q)\n",
    "tokenized_R = tokenizer.tokenize(reference_set)\n",
    "tokenized_S = [tokenizer.tokenize(S) for S in source_sets]\n",
    "\n",
    "display(Markdown(\"**Tokenized Reference set (R):**\"))\n",
    "for i, toks in enumerate(tokenized_R):\n",
    "    display(Markdown(f\"- Tokens of R[{i}]: {toks}\"))\n",
    "\n",
    "display(Markdown(\"**Tokenized Source sets (S):**\"))\n",
    "for i, toks in enumerate(tokenized_S):\n",
    "    display(Markdown(f\"- Tokens of S[{i}]: {toks}\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e17b807b",
   "metadata": {},
   "source": [
    "### 2. Build Inverted Index\n",
    "Builds an inverted index on the tokenized source sets and shows an example lookup."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "22c7d1d6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "- Index built over 4 source sets."
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/markdown": [
       "- Example: token “Mass” appears in [(0, 0), (0, 1), (0, 2), (1, 2), (2, 0), (2, 1), (2, 2), (3, 0)]"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "index = InvertedIndex(tokenized_S)\n",
    "display(Markdown(f\"- Index built over {len(source_sets)} source sets.\"))\n",
    "display(Markdown(f\"- Example: token “Mass” appears in {index.get_indexes('Mass')}\"))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cc17daac",
   "metadata": {},
   "source": [
    "### 3. Signature Generation"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1c48bac2",
   "metadata": {},
   "source": [
    "Generates the weighted signature for R given δ, α (here α=0), using Jaccard Similarity."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "a36be65c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "- Selected signature tokens: **['Chicago', 'WA', 'IL', '5th']**"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "sig_gen = SignatureGenerator()\n",
    "signature = sig_gen.get_signature(\n",
    "    tokenized_R, index,\n",
    "    delta=δ, alpha=α,\n",
    "    sig_type=SigType.WEIGHTED,\n",
    "    sim_fun=jaccard_similarity,\n",
    "    q=q\n",
    ")\n",
    "display(Markdown(f\"- Selected signature tokens: **{signature}**\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "938be3e2",
   "metadata": {},
   "source": [
    "### 4. Initial Candidate Selection\n",
    "\n",
    "Looks up each signature token in the inverted index to form the candidate set.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "58017e27",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "- Candidate set indices: **[0, 1, 2, 3]**"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/markdown": [
       "  - S[0]: “Mass Ave St Boston 02115 | 77 Mass 5th St Boston | 77 Mass Ave 5th 02115”"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/markdown": [
       "  - S[1]: “77 Boston MA | 77 5th St Boston 02115 | 77 Mass Ave 02115 Seattle”"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/markdown": [
       "  - S[2]: “77 Mass Ave 5th Boston MA | Mass Ave Chicago IL | 77 Mass Ave St”"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/markdown": [
       "  - S[3]: “77 Mass Ave MA | 5th St 02115 Seattle WA | 77 5th St Boston Seattle”"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "cand_sel = CandidateSelector(\n",
    "    similarity_func=jaccard_similarity,\n",
    "    sim_metric=contain,\n",
    "    related_thresh=δ,\n",
    "    sim_thresh=α,\n",
    "    q=q\n",
    ")\n",
    "\n",
    "initial_cands = cand_sel.get_candidates(signature, index, len(tokenized_R))\n",
    "display(Markdown(f\"- Candidate set indices: **{sorted(initial_cands)}**\"))\n",
    "for j in sorted(initial_cands):\n",
    "    display(Markdown(f\"  - S[{j}]: “{' | '.join(source_sets[j])}”\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d633e5f9",
   "metadata": {},
   "source": [
    "### 5. Check Filter\n",
    "Prunes candidates by ensuring each matched element passes the local similarity bound.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "9a2bfdeb",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "**Surviving after check filter:** **[0, 1, 3]**"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/markdown": [
       "S[0] matched:"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/markdown": [
       "     • R[2] “77 5th St Chicago IL” → sim = 0.429"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/markdown": [
       "  → Best sim: **0.429** | Matched elements: **1**"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/markdown": [
       "S[1] matched:"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/markdown": [
       "     • R[2] “77 5th St Chicago IL” → sim = 0.429"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/markdown": [
       "  → Best sim: **0.429** | Matched elements: **1**"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/markdown": [
       "S[3] matched:"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/markdown": [
       "     • R[1] “5th St 02115 Seattle WA” → sim = 1.000"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/markdown": [
       "     • R[2] “77 5th St Chicago IL” → sim = 0.429"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/markdown": [
       "  → Best sim: **1.000** | Matched elements: **2**"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "filtered_cands, match_map = cand_sel.check_filter(\n",
    "    tokenized_R, set(signature), initial_cands, index\n",
    ")\n",
    "display(Markdown(f\"**Surviving after check filter:** **{sorted(filtered_cands)}**\"))\n",
    "for j in sorted(filtered_cands):\n",
    "    display(Markdown(f\"S[{j}] matched:\"))\n",
    "    for r_idx, sim in match_map[j].items():\n",
    "        sim_text = f\"{sim:.3f}\"\n",
    "        display(Markdown(f\"     • R[{r_idx}] “{reference_set[r_idx]}” → sim = {sim_text}\"))\n",
    "    \n",
    "    matches = match_map.get(j, {})\n",
    "    if matches:\n",
    "        best_sim = max(matches.values())\n",
    "        num_matches = len(matches)\n",
    "        display(Markdown(f\"  → Best sim: **{best_sim:.3f}** | Matched elements: **{num_matches}**\"))\n",
    "    else:\n",
    "        display(Markdown(f\"No elements passed similarity checks.\"))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cc37bb7f",
   "metadata": {},
   "source": [
    "### 6. Nearest‑Neighbor Filter\n",
    "\n",
    "Further prunes via nearest‑neighbor upper bounds on total matching score.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "aa9b7a63",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "- Surviving after NN filter: **[3]**"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/markdown": [
       "  - S[3]: “77 Mass Ave MA | 5th St 02115 Seattle WA | 77 5th St Boston Seattle”"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "nn_filtered = cand_sel.nn_filter(\n",
    "    tokenized_R, set(signature), filtered_cands,\n",
    "    index, threshold=δ, match_map=match_map\n",
    ")\n",
    "display(Markdown(f\"- Surviving after NN filter: **{sorted(nn_filtered)}**\"))\n",
    "for j in nn_filtered:\n",
    "    display(Markdown(f\"  - S[{j}]: “{' | '.join(source_sets[j])}”\"))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8638f83a",
   "metadata": {},
   "source": [
    "### 7. Verification\n",
    "\n",
    "Runs the bipartite max‑matching on the remaining candidates and outputs the final related sets.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "ebdf20fe",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "Final related sets (score ≥ 0.7):"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/markdown": [
       "  • S[3]: “77 Mass Ave MA | 5th St 02115 Seattle WA | 77 5th St Boston Seattle” → **0.743**"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "verifier = Verifier(δ, contain, jaccard_similarity, sim_thresh=α, reduction=False)\n",
    "results = verifier.get_related_sets(tokenized_R, nn_filtered, index)\n",
    "\n",
    "if results:\n",
    "    display(Markdown(f\"Final related sets (score ≥ {δ}):\"))\n",
    "    for j, score in results:\n",
    "        display(Markdown(f\"  • S[{j}]: “{' | '.join(source_sets[j])}” → **{score:.3f}**\"))\n",
    "else:\n",
    "    display(Markdown(\"- No sets passed verification.\"))\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "silkmoth_env",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}