init
Update README.md
8
docu/README.md
Normal file
@@ -0,0 +1,8 @@
|
||||
### Generating Documentation Page
|
||||
|
||||
To generate a [documentation page](https://berscjak.github.io/) from source code with mkdocs, run the following from the root directory:
|
||||
|
||||
```
|
||||
pip install mkdocs "mkdocstrings[python]" mkdocs-awesome-pages-plugin
|
||||
mkdocs serve
|
||||
```
|
||||
823
docu/demo_example.ipynb
Normal file
@@ -0,0 +1,823 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "c9f89a47",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## SilkMoth Demo"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2ca15800",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Related Set Discovery task under Set‑Containment using Jaccard Similarity"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ea6ce5fb",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Import of all required modules:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"id": "bdd1b92c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import sys\n",
|
||||
"sys.path.append(\"src\")\n",
|
||||
"\n",
|
||||
"from silkmoth.tokenizer import Tokenizer\n",
|
||||
"from silkmoth.inverted_index import InvertedIndex\n",
|
||||
"from silkmoth.signature_generator import SignatureGenerator\n",
|
||||
"from silkmoth.candidate_selector import CandidateSelector\n",
|
||||
"from silkmoth.verifier import Verifier\n",
|
||||
"from silkmoth.silkmoth_engine import SilkMothEngine\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"from silkmoth.utils import jaccard_similarity, contain, edit_similarity, similar, SigType\n",
|
||||
"\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"from IPython.display import display, Markdown\n",
|
||||
"\n",
|
||||
"import numpy as np\n",
|
||||
"import pandas as pd"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "bf6bf1f5",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Define example related dataset from \"SilkMoth\" paper (reference set **R** and source sets **S**)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 25,
|
||||
"id": "598a4bbf",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"**Reference set (R):**"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"- R[0]: “77 Mass Ave Boston MA”"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"- R[1]: “5th St 02115 Seattle WA”"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"- R[2]: “77 5th St Chicago IL”"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"**Source sets (S):**"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"- S[0]: “Mass Ave St Boston 02115 | 77 Mass 5th St Boston | 77 Mass Ave 5th 02115”"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"- S[1]: “77 Boston MA | 77 5th St Boston 02115 | 77 Mass Ave 02115 Seattle”"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"- S[2]: “77 Mass Ave 5th Boston MA | Mass Ave Chicago IL | 77 Mass Ave St”"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"- S[3]: “77 Mass Ave MA | 5th St 02115 Seattle WA | 77 5th St Boston Seattle”"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Location Dataset\n",
|
||||
"reference_set = [\n",
|
||||
" '77 Mass Ave Boston MA',\n",
|
||||
" '5th St 02115 Seattle WA',\n",
|
||||
" '77 5th St Chicago IL'\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"# Address Dataset\n",
|
||||
"source_sets = [\n",
|
||||
" ['Mass Ave St Boston 02115','77 Mass 5th St Boston','77 Mass Ave 5th 02115'],\n",
|
||||
" ['77 Boston MA','77 5th St Boston 02115','77 Mass Ave 02115 Seattle'],\n",
|
||||
" ['77 Mass Ave 5th Boston MA','Mass Ave Chicago IL','77 Mass Ave St'],\n",
|
||||
" ['77 Mass Ave MA','5th St 02115 Seattle WA','77 5th St Boston Seattle']\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"# thresholds & q\n",
|
||||
"δ = 0.7\n",
|
||||
"α = 0.0\n",
|
||||
"q = 3\n",
|
||||
"\n",
|
||||
"display(Markdown(\"**Reference set (R):**\"))\n",
|
||||
"for i, r in enumerate(reference_set):\n",
|
||||
" display(Markdown(f\"- R[{i}]: “{r}”\"))\n",
|
||||
"display(Markdown(\"**Source sets (S):**\"))\n",
|
||||
"for j, S in enumerate(source_sets):\n",
|
||||
" display(Markdown(f\"- S[{j}]: “{' | '.join(S)}”\"))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a50b350a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 1. Tokenization\n",
|
||||
"Tokenize each element of R and each S using Jaccard Similarity (whitespace tokens)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 26,
|
||||
"id": "55e7b5d0",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"**Tokenized Reference set (R):**"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"- Tokens of R[0]: {'Ave', 'MA', '77', 'Boston', 'Mass'}"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"- Tokens of R[1]: {'5th', 'Seattle', 'St', 'WA', '02115'}"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"- Tokens of R[2]: {'77', '5th', 'IL', 'St', 'Chicago'}"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"**Tokenized Source sets (S):**"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"- Tokens of S[0]: [{'Ave', 'Boston', 'St', 'Mass', '02115'}, {'77', 'Boston', '5th', 'St', 'Mass'}, {'Ave', '77', '5th', 'Mass', '02115'}]"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"- Tokens of S[1]: [{'Boston', 'MA', '77'}, {'77', 'Boston', '5th', 'St', '02115'}, {'Ave', '77', 'Seattle', 'Mass', '02115'}]"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"- Tokens of S[2]: [{'Ave', 'MA', '77', 'Boston', '5th', 'Mass'}, {'IL', 'Ave', 'Mass', 'Chicago'}, {'St', 'Ave', 'Mass', '77'}]"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"- Tokens of S[3]: [{'Ave', 'Mass', '77', 'MA'}, {'5th', 'Seattle', 'St', 'WA', '02115'}, {'77', 'Boston', '5th', 'Seattle', 'St'}]"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"tokenizer = Tokenizer(jaccard_similarity, q)\n",
|
||||
"tokenized_R = tokenizer.tokenize(reference_set)\n",
|
||||
"tokenized_S = [tokenizer.tokenize(S) for S in source_sets]\n",
|
||||
"\n",
|
||||
"display(Markdown(\"**Tokenized Reference set (R):**\"))\n",
|
||||
"for i, toks in enumerate(tokenized_R):\n",
|
||||
" display(Markdown(f\"- Tokens of R[{i}]: {toks}\"))\n",
|
||||
"\n",
|
||||
"display(Markdown(\"**Tokenized Source sets (S):**\"))\n",
|
||||
"for i, toks in enumerate(tokenized_S):\n",
|
||||
" display(Markdown(f\"- Tokens of S[{i}]: {toks}\"))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e17b807b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 2. Build Inverted Index\n",
|
||||
"Builds an inverted index on the tokenized source sets and shows an example lookup."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"id": "22c7d1d6",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"- Index built over 4 source sets."
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"- Example: token “Mass” appears in [(0, 0), (0, 1), (0, 2), (1, 2), (2, 0), (2, 1), (2, 2), (3, 0)]"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"index = InvertedIndex(tokenized_S)\n",
|
||||
"display(Markdown(f\"- Index built over {len(source_sets)} source sets.\"))\n",
|
||||
"display(Markdown(f\"- Example: token “Mass” appears in {index.get_indexes('Mass')}\"))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cc17daac",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 3. Signature Generation"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1c48bac2",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Generates the weighted signature for R given δ, α (here α=0), using Jaccard Similarity."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"id": "a36be65c",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"- Selected signature tokens: **['Chicago', 'WA', 'IL', '5th']**"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"sig_gen = SignatureGenerator()\n",
|
||||
"signature = sig_gen.get_signature(\n",
|
||||
" tokenized_R, index,\n",
|
||||
" delta=δ, alpha=α,\n",
|
||||
" sig_type=SigType.WEIGHTED,\n",
|
||||
" sim_fun=jaccard_similarity,\n",
|
||||
" q=q\n",
|
||||
")\n",
|
||||
"display(Markdown(f\"- Selected signature tokens: **{signature}**\"))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "938be3e2",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 4. Initial Candidate Selection\n",
|
||||
"\n",
|
||||
"Looks up each signature token in the inverted index to form the candidate set.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 29,
|
||||
"id": "58017e27",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"- Candidate set indices: **[0, 1, 2, 3]**"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
" - S[0]: “Mass Ave St Boston 02115 | 77 Mass 5th St Boston | 77 Mass Ave 5th 02115”"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
" - S[1]: “77 Boston MA | 77 5th St Boston 02115 | 77 Mass Ave 02115 Seattle”"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
" - S[2]: “77 Mass Ave 5th Boston MA | Mass Ave Chicago IL | 77 Mass Ave St”"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
" - S[3]: “77 Mass Ave MA | 5th St 02115 Seattle WA | 77 5th St Boston Seattle”"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"cand_sel = CandidateSelector(\n",
|
||||
" similarity_func=jaccard_similarity,\n",
|
||||
" sim_metric=contain,\n",
|
||||
" related_thresh=δ,\n",
|
||||
" sim_thresh=α,\n",
|
||||
" q=q\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"initial_cands = cand_sel.get_candidates(signature, index, len(tokenized_R))\n",
|
||||
"display(Markdown(f\"- Candidate set indices: **{sorted(initial_cands)}**\"))\n",
|
||||
"for j in sorted(initial_cands):\n",
|
||||
" display(Markdown(f\" - S[{j}]: “{' | '.join(source_sets[j])}”\"))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d633e5f9",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 5. Check Filter\n",
|
||||
"Prunes candidates by ensuring each matched element passes the local similarity bound.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 30,
|
||||
"id": "9a2bfdeb",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"**Surviving after check filter:** **[0, 1, 3]**"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"S[0] matched:"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
" • R[2] “77 5th St Chicago IL” → sim = 0.429"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
" → Best sim: **0.429** | Matched elements: **1**"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"S[1] matched:"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
" • R[2] “77 5th St Chicago IL” → sim = 0.429"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
" → Best sim: **0.429** | Matched elements: **1**"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"S[3] matched:"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
" • R[1] “5th St 02115 Seattle WA” → sim = 1.000"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
" • R[2] “77 5th St Chicago IL” → sim = 0.429"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
" → Best sim: **1.000** | Matched elements: **2**"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"filtered_cands, match_map = cand_sel.check_filter(\n",
|
||||
" tokenized_R, set(signature), initial_cands, index\n",
|
||||
")\n",
|
||||
"display(Markdown(f\"**Surviving after check filter:** **{sorted(filtered_cands)}**\"))\n",
|
||||
"for j in sorted(filtered_cands):\n",
|
||||
" display(Markdown(f\"S[{j}] matched:\"))\n",
|
||||
" for r_idx, sim in match_map[j].items():\n",
|
||||
" sim_text = f\"{sim:.3f}\"\n",
|
||||
" display(Markdown(f\" • R[{r_idx}] “{reference_set[r_idx]}” → sim = {sim_text}\"))\n",
|
||||
" \n",
|
||||
" matches = match_map.get(j, {})\n",
|
||||
" if matches:\n",
|
||||
" best_sim = max(matches.values())\n",
|
||||
" num_matches = len(matches)\n",
|
||||
" display(Markdown(f\" → Best sim: **{best_sim:.3f}** | Matched elements: **{num_matches}**\"))\n",
|
||||
" else:\n",
|
||||
" display(Markdown(f\"No elements passed similarity checks.\"))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cc37bb7f",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 6. Nearest‑Neighbor Filter\n",
|
||||
"\n",
|
||||
"Further prunes via nearest‑neighbor upper bounds on total matching score.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 31,
|
||||
"id": "aa9b7a63",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"- Surviving after NN filter: **[3]**"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
" - S[3]: “77 Mass Ave MA | 5th St 02115 Seattle WA | 77 5th St Boston Seattle”"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"nn_filtered = cand_sel.nn_filter(\n",
|
||||
" tokenized_R, set(signature), filtered_cands,\n",
|
||||
" index, threshold=δ, match_map=match_map\n",
|
||||
")\n",
|
||||
"display(Markdown(f\"- Surviving after NN filter: **{sorted(nn_filtered)}**\"))\n",
|
||||
"for j in nn_filtered:\n",
|
||||
" display(Markdown(f\" - S[{j}]: “{' | '.join(source_sets[j])}”\"))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "8638f83a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 7. Verification\n",
|
||||
"\n",
|
||||
"Runs the bipartite max‑matching on the remaining candidates and outputs the final related sets.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 32,
|
||||
"id": "ebdf20fe",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"Final related sets (score ≥ 0.7):"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
" • S[3]: “77 Mass Ave MA | 5th St 02115 Seattle WA | 77 5th St Boston Seattle” → **0.743**"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"verifier = Verifier(δ, contain, jaccard_similarity, sim_thresh=α, reduction=False)\n",
|
||||
"results = verifier.get_related_sets(tokenized_R, nn_filtered, index)\n",
|
||||
"\n",
|
||||
"if results:\n",
|
||||
" display(Markdown(f\"Final related sets (score ≥ {δ}):\"))\n",
|
||||
" for j, score in results:\n",
|
||||
" display(Markdown(f\" • S[{j}]: “{' | '.join(source_sets[j])}” → **{score:.3f}**\"))\n",
|
||||
"else:\n",
|
||||
" display(Markdown(\"- No sets passed verification.\"))\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "silkmoth_env",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
155
docu/experiments/README.md
Normal file
@@ -0,0 +1,155 @@
|
||||
### 🧪 Running the Experiments
|
||||
|
||||
This project includes multiple experiments to evaluate the performance and accuracy of our Python implementation of **SilkMoth**.
|
||||
|
||||
---
|
||||
|
||||
#### 📊 1. Experiment Types
|
||||
|
||||
You can replicate and customize the following types of experiments using different configurations (e.g., filters, signature strategies, reduction techniques):
|
||||
|
||||
- **String Matching (DBLP Publication Titles)**
|
||||
- **Schema Matching (WebTables)**
|
||||
- **Inclusion Dependency Discovery (WebTable Columns)**
|
||||
|
||||
Exact descriptions can be found in the official paper.
|
||||
|
||||
---
|
||||
|
||||
#### 📦 2. WebSchema Inclusion Dependency Setup
|
||||
|
||||
To run the **WebSchema + Inclusion Dependency** experiments:
|
||||
|
||||
1. Download the pre-extracted dataset from
|
||||
[📥 this link](https://tubcloud.tu-berlin.de/s/D4ngEfdn3cJ3pxF).
|
||||
2. Place the `.json` files in the `data/webtables/` directory
|
||||
*(create the folder if it does not exist)*.
|
||||
|
||||
---
|
||||
|
||||
#### 🚀 3. Running the Experiments
|
||||
|
||||
To execute the core experiments from the paper:
|
||||
|
||||
```bash
|
||||
python run.py
|
||||
```
|
||||
|
||||
### 📈 4. Results Overview
|
||||
|
||||
We compared our results with those presented in the original SilkMoth paper.
|
||||
Although exact reproduction is not possible due to language differences (Python vs C++) and dataset variations, overall **performance trends align well**.
|
||||
|
||||
All the results can be found in the folder `results`.
|
||||
|
||||
The **left** diagrams are from the paper and the **right** are ours.
|
||||
|
||||
> 💡 *Recent performance enhancements leverage `scipy`’s C-accelerated matching, replacing the original `networkx`-based approach.
|
||||
> Unless otherwise specified, the diagrams shown are generated using the `networkx` implementation.*
|
||||
|
||||
|
||||
---
|
||||
|
||||
### 🔍 Inclusion Dependency
|
||||
|
||||
> **Goal**: Check if each reference set is contained within source sets.
|
||||
|
||||
**Filter Comparison**
|
||||
<p align="center">
|
||||
<img src="silkmoth_results/inclusion_dep_filter.png" alt="Our Result" width="45%" />
|
||||
<img src="results/inclusion_dependency/inclusion_dependency_filter_experiment_α=0.5.png" alt="Original Result" width="45%" />
|
||||
</p>
|
||||
|
||||
**Signature Comparison**
|
||||
<p align="center">
|
||||
<img src="silkmoth_results/inclusion_dep_sig.png" alt="Our Result" width="45%" />
|
||||
<img src="results/inclusion_dependency/inclusion_dependency_sig_experiment_α=0.5.png" alt="Original Result" width="45%" />
|
||||
</p>
|
||||
|
||||
**Reduction Comparison**
|
||||
<p align="center">
|
||||
<img src="silkmoth_results/inclusion_dep_red.png" alt="Our Result" width="45%" />
|
||||
<img src="results/inclusion_dependency/inclusion_dependency_reduction_experiment_α=0.0.png" alt="Original Result" width="45%" />
|
||||
</p>
|
||||
|
||||
**Scalability**
|
||||
<p align="center">
|
||||
<img src="silkmoth_results/inclusion_dep_scal.png" alt="Our Result" width="45%" />
|
||||
<img src="results/inclusion_dependency/inclusion_dependency_scalability_experiment_α=0.5.png" alt="Original Result" width="45%" />
|
||||
</p>
|
||||
|
||||
---
|
||||
|
||||
### 🔍 Schema Matching (WebTables)
|
||||
|
||||
> **Goal**: Detect related set pairs within a single source set.
|
||||
|
||||
**Filter Comparison**
|
||||
<p align="center">
|
||||
<img src="silkmoth_results/schema_matching_filter.png" alt="Our Result" width="45%" />
|
||||
<img src="results/schema_matching/schema_matching_filter_experiment_α=0.png" alt="Original Result" width="45%" />
|
||||
</p>
|
||||
|
||||
**Signature Comparison**
|
||||
<p align="center">
|
||||
<img src="silkmoth_results/schema_matching_sig.png" alt="Our Result" width="45%" />
|
||||
<img src="results/schema_matching/schema_matching_sig_experiment_α=0.0.png" alt="Original Result" width="45%" />
|
||||
</p>
|
||||
|
||||
**Scalability**
|
||||
<p align="center">
|
||||
<img src="silkmoth_results/schema_matching_scal.png" alt="Our Result" width="45%" />
|
||||
<img src="results/schema_matching/schema_matching_scalability_experiment_α=0.0.png" alt="Original Result" width="45%" />
|
||||
</p>
|
||||
|
||||
---
|
||||
|
||||
### 🔍 String Matching (DBLP Publication Titles)
|
||||
>**Goal:** Detect related titles within the dataset using the extended SilkMoth pipeline
|
||||
> based on **edit similarity** and **q-gram** tokenization.
|
||||
> SciPy was used here.
|
||||
|
||||
**Filter Comparison**
|
||||
<p align="center">
|
||||
<img src="silkmoth_results/string_matching_filter.png" alt="Our Result" width="45%" />
|
||||
<img src="results/string_matching/10k-set-size/string_matching_filter_experiment_α=0.8.png" alt="Original Result" width="45%" />
|
||||
</p>
|
||||
|
||||
**Signature Comparison**
|
||||
<p align="center">
|
||||
<img src="silkmoth_results/string_matching_sig.png" alt="Our Result" width="45%" />
|
||||
<img src="results/string_matching/10k-set-size/string_matching_sig_experiment_α=0.8.png" alt="Original Result" width="45%" />
|
||||
</p>
|
||||
|
||||
**Scalability**
|
||||
<p align="center">
|
||||
<img src="silkmoth_results/string_matching_scal.png" alt="Our Result" width="45%" />
|
||||
<img src="results/string_matching/string_matching_scalability_experiment_α=0.8.png" alt="Original Result" width="45%" />
|
||||
</p>
|
||||
---
|
||||
|
||||
### 🔍 Additional: Inclusion Dependency SilkMoth Filter compared with no SilkMoth
|
||||
|
||||
> In this analysis, we focus exclusively on SilkMoth. But how does it compare to a
|
||||
> brute-force approach that skips the SilkMoth pipeline entirely? The graph below
|
||||
> shows the Filter run alongside the brute-force bipartite matching method without any
|
||||
> optimization pipeline. The results clearly demonstrate a dramatic improvement
|
||||
> in runtime efficiency when using SilkMoth.
|
||||
|
||||
|
||||
<img src="results/inclusion_dependency/inclusion_dependency_filter_combined_raw_experiment_α=0.5.png" alt="WebTables Result" />
|
||||
|
||||
|
||||
---
|
||||
|
||||
### 🔍 Additional: Schema Matching with GitHub WebTables
|
||||
|
||||
> Similar to Schema Matching, this experiment uses a GitHub WebTable as a fixed reference set and matches it against other sets. The goal is to evaluate SilkMoth’s performance across different domains.
|
||||

**Left:** Matching with one reference set.
|
||||
**Right:** Matching with WebTable Corpus and GitHub WebTable datasets.
|
||||
The results show no significant difference, indicating consistent behavior across varying datasets.
|
||||
|
||||
<p align="center">
|
||||
<img src="results/schema_matching/schema_matching_filter_experiment_α=0.5.png" alt="WebTables Result" width="45%" />
|
||||
<img src="results/schema_matching/github_webtable_schema_matching_experiment_α=0.5.png" alt="GitHub Table Result" width="45%" />
|
||||
</p>
|
||||
|
After Width: | Height: | Size: 125 KiB |
|
After Width: | Height: | Size: 151 KiB |
|
After Width: | Height: | Size: 166 KiB |
|
After Width: | Height: | Size: 241 KiB |
|
After Width: | Height: | Size: 207 KiB |
64
docu/experiments/results/plot.py
Normal file
@@ -0,0 +1,64 @@
|
||||
from experiments.utils import plot_elapsed_times
|
||||
import csv
|
||||
|
||||
import csv
|
||||
|
||||
labels = []
|
||||
elapsed_times = []
|
||||
|
||||
def read_csv_add_data(filename, labels, elapsed_times):
|
||||
with open(filename, newline='') as csvfile:
|
||||
reader = csv.reader(csvfile)
|
||||
next(reader) # skip header
|
||||
times = []
|
||||
current_label = None
|
||||
for row in reader:
|
||||
sim_thresh = float(row[0])
|
||||
label = row[4]
|
||||
elapsed = float(row[5])
|
||||
|
||||
if sim_thresh == 0.5:
|
||||
if current_label != label:
|
||||
# New label group started
|
||||
if times:
|
||||
# Save times of previous label if not empty
|
||||
elapsed_times.append(times)
|
||||
times = [elapsed]
|
||||
current_label = label
|
||||
else:
|
||||
times.append(elapsed)
|
||||
|
||||
# When 4 times collected, append and reset
|
||||
if len(times) == 4:
|
||||
elapsed_times.append(times)
|
||||
times = []
|
||||
current_label = None
|
||||
|
||||
if label not in labels:
|
||||
labels.append(label)
|
||||
|
||||
# In case last label times were not appended
|
||||
if times:
|
||||
elapsed_times.append(times)
|
||||
|
||||
# Read first CSV
|
||||
read_csv_add_data('inclusion_dependency/raw_matching_experiment_results.csv', labels, elapsed_times)
|
||||
|
||||
# Read second CSV
|
||||
read_csv_add_data('inclusion_dependency/inclusion_dependency_filter_experiment_results.csv', labels, elapsed_times)
|
||||
|
||||
print("Labels:", labels)
|
||||
print("Elapsed Times:", elapsed_times)
|
||||
|
||||
# Then plot
|
||||
file_name_prefix = "inclusion_dependency_filter_combined_raw"
|
||||
folder_path = ""
|
||||
|
||||
_ = plot_elapsed_times(
|
||||
related_thresholds=[0.7, 0.75, 0.8, 0.85],
|
||||
elapsed_times_list=elapsed_times,
|
||||
fig_text=f"{file_name_prefix} (α = 0.5)",
|
||||
legend_labels=labels,
|
||||
file_name=f"{folder_path}{file_name_prefix}_experiment_α=0.5.png"
|
||||
)
|
||||
|
||||
|
After Width: | Height: | Size: 171 KiB |
|
After Width: | Height: | Size: 193 KiB |
|
After Width: | Height: | Size: 188 KiB |
|
After Width: | Height: | Size: 248 KiB |
|
After Width: | Height: | Size: 207 KiB |
|
After Width: | Height: | Size: 159 KiB |
|
After Width: | Height: | Size: 199 KiB |
|
After Width: | Height: | Size: 221 KiB |
BIN
docu/experiments/silkmoth_results/inclusion_dep_filter.png
Normal file
|
After Width: | Height: | Size: 37 KiB |
BIN
docu/experiments/silkmoth_results/inclusion_dep_red.png
Normal file
|
After Width: | Height: | Size: 30 KiB |
BIN
docu/experiments/silkmoth_results/inclusion_dep_scal.png
Normal file
|
After Width: | Height: | Size: 53 KiB |
BIN
docu/experiments/silkmoth_results/inclusion_dep_sig.png
Normal file
|
After Width: | Height: | Size: 47 KiB |
BIN
docu/experiments/silkmoth_results/schema_matching_filter.png
Normal file
|
After Width: | Height: | Size: 42 KiB |
BIN
docu/experiments/silkmoth_results/schema_matching_scal.png
Normal file
|
After Width: | Height: | Size: 48 KiB |
BIN
docu/experiments/silkmoth_results/schema_matching_sig.png
Normal file
|
After Width: | Height: | Size: 42 KiB |
BIN
docu/experiments/silkmoth_results/string_matching_filter.png
Normal file
|
After Width: | Height: | Size: 44 KiB |
BIN
docu/experiments/silkmoth_results/string_matching_scal.png
Normal file
|
After Width: | Height: | Size: 51 KiB |
BIN
docu/experiments/silkmoth_results/string_matching_sig.png
Normal file
|
After Width: | Height: | Size: 53 KiB |
BIN
docu/figures/InvertedIndex.png
Normal file
|
After Width: | Height: | Size: 62 KiB |
BIN
docu/figures/Pipeline.png
Normal file
|
After Width: | Height: | Size: 230 KiB |
151
docu/index.md
Normal file
@@ -0,0 +1,151 @@
|
||||
# 🦋 LSDIPro SS2025
|
||||
|
||||
## 📄 [SilkMoth: An Efficient Method for Finding Related Sets](https://doi.org/10.14778/3115404.3115413)
|
||||
|
||||
A project inspired by the SilkMoth paper, exploring efficient techniques for related set discovery.
|
||||
|
||||
---
|
||||
|
||||
## 👥 Team Members
|
||||
- **Andreas Wilms**
|
||||
- **Sarra Daknou**
|
||||
- **Amina Iqbal**
|
||||
- **Jakob Berschneider**
|
||||
|
||||
---
|
||||
|
||||
## 📊 Experiments & Results
|
||||
➡️ [**See Experiments**](experiments/README.md)
|
||||
|
||||
---
|
||||
|
||||
## 🧪 Interactive Demo
|
||||
|
||||
Follow our **step-by-step Jupyter Notebook demo** for a hands-on understanding of SilkMoth.
|
||||
|
||||
📓 [**Open demo_example.ipynb**](demo_example.ipynb)
|
||||
|
||||
---
|
||||
|
||||
## Table of Contents
|
||||
|
||||
- [1. Large Scale Data Integration Project (LSDIPro)](#1-large-scale-data-integration-project-lsdipro)
|
||||
- [2. What is SilkMoth? 🐛](#2-what-is-silkmoth)
|
||||
- [3. The Problem 🧩](#3-the-problem)
|
||||
- [4. SilkMoth’s Solution 🚀](#4-silkmoths-solution)
|
||||
- [5. Core Pipeline Steps 🔁](#5-core-pipeline-steps)
|
||||
- [5.1 Tokenization](#51-tokenization)
|
||||
- [5.2 Inverted Index Construction](#52-inverted-index-construction)
|
||||
- [5.3 Signature Generation](#53-signature-generation)
|
||||
- [5.4 Candidate Selection](#54-candidate-selection)
|
||||
- [5.5 Refinement Filters](#55-refinement-filters)
|
||||
- [5.6 Verification via Maximum Matching](#56-verification-via-maximum-matching)
|
||||
- [6. Modes of Operation 🧪](#6-modes-of-operation-)
|
||||
- [7. Supported Similarity Functions 📐](#7-supported-similarity-functions-)
|
||||
- [8. Installing from Source](#8-installing-from-source)
|
||||
- [9. Experiment Results](#9-experiment-results)
|
||||
|
||||
---
|
||||
|
||||
## 1. Large Scale Data Integration Project (LSDIPro)
|
||||
|
||||
As part of the university project LSDIPro, our team implemented the SilkMoth paper in Python.
|
||||
The course focuses on large-scale data integration, where student groups reproduce and extend research prototypes.
|
||||
The project emphasizes scalable algorithm design, evaluation, and handling heterogeneous data at scale.
|
||||
|
||||
---
|
||||
|
||||
## 2. What is SilkMoth?
|
||||
|
||||
**SilkMoth** is a system designed to efficiently discover related sets in large collections of data, even when the elements within those sets are only approximately similar.
|
||||
This is especially important in **data integration**, **data cleaning**, and **information retrieval**, where messy or inconsistent data is common.
|
||||
|
||||
---
|
||||
|
||||
## 3. The Problem
|
||||
|
||||
Determining whether two sets are related, for example, whether two database columns should be joined, often involves comparing their elements using **similarity functions** (not just exact matches).
|
||||
A powerful approach models this as a **bipartite graph** and finds the **maximum matching score** between elements. However, this method is **computationally expensive** (`O(n³)` per pair), making it impractical for large datasets.
|
||||
|
||||
---
|
||||
|
||||
## 4. SilkMoth’s Solution
|
||||
|
||||
SilkMoth tackles this with a three-step approach:
|
||||
|
||||
1. **Signature Generation**: Creates compact signatures for each set, ensuring related sets share signature parts.
|
||||
2. **Pruning**: Filters out unrelated sets early, reducing candidates.
|
||||
3. **Verification**: Applies the costly matching metric only to the remaining candidates, matching brute-force accuracy while running substantially faster.
|
||||
|
||||
---
|
||||
|
||||
## 5. Core Pipeline Steps
|
||||
|
||||

|
||||
|
||||
*Figure 1. SILKMOTH pipeline framework. Source: Deng et al., "SILKMOTH: An Efficient Method for Finding Related Sets with Maximum Matching Constraints", VLDB 2017. Licensed under CC BY-NC-ND 4.0.*
|
||||
|
||||
### [5.1 Tokenization](pages/tokenizer.md)
|
||||
|
||||
Each element in every set is tokenized based on the selected similarity function:
|
||||
- **Jaccard Similarity**: Elements are split into whitespace-delimited tokens.
|
||||
- **Edit Similarity**: Elements are split into overlapping `q`-grams (e.g., 3-grams).
|
||||
|
||||
### [5.2 Inverted Index Construction](pages/inverted_index.md)
|
||||
|
||||
An **inverted index** is built from the reference set `R` to map each token to a list of `(set, element)` pairs in which it occurs.
|
||||
This allows fast lookup of candidate sets sharing tokens with a query.
|
||||
|
||||
### [5.3 Signature Generation](pages/signature_generator.md)
|
||||
|
||||
A **signature** is a subset of tokens selected from each set such that:
|
||||
- Any related set must share at least one signature token.
|
||||
- Signature size is minimized to reduce candidate space.
|
||||
|
||||
Signature selection heuristics (e.g., cost/value greedy ranking) approximate the optimal valid signature, which is NP-complete to compute exactly.
|
||||
|
||||
### [5.4 Candidate Selection](pages/candidate_selector.md)
|
||||
|
||||
For each set `R`, retrieve from the inverted index all sets `S` sharing at least one token with `R`’s signature. These become **candidate sets** for further evaluation.
|
||||
|
||||
### [5.5 Refinement Filters](pages/candidate_selector.md)
|
||||
|
||||
Two filters reduce false positives among candidates:
|
||||
- **Check Filter**: Uses an upper bound on similarity to eliminate sets below threshold.
|
||||
- **Nearest Neighbor Filter**: Approximates maximum matching score using nearest neighbor similarity for each element in `R`.
|
||||
|
||||
### [5.6 Verification via Maximum Matching](pages/verifier.md)
|
||||
|
||||
Compute **maximum weighted bipartite matching** between elements of `R` and `S` for remaining candidates using the similarity function as edge weights.
|
||||
Sets meeting or exceeding threshold `δ` are considered **related**.
|
||||
|
||||
---
|
||||
|
||||
## 6. Modes of Operation 🧪
|
||||
|
||||
- **Discovery Mode**: Compare all pairs of sets to find all related pairs.
|
||||
*Use case:* Finding related columns in databases.
|
||||
|
||||
- **Search Mode**: Given a reference set, find all related sets.
|
||||
*Use case:* Schema matching or entity deduplication.
|
||||
|
||||
---
|
||||
|
||||
## 7. Supported Similarity Functions 📐
|
||||
|
||||
- **Jaccard Similarity**
|
||||
- **Edit Similarity** (Levenshtein-based)
|
||||
- Optional minimum similarity threshold `α` on element comparisons.
|
||||
|
||||
---
|
||||
|
||||
## 8. Installing from Source
|
||||
|
||||
1. From the repository root, run `pip install src/` to install the package.
|
||||
|
||||
---
|
||||
|
||||
|
||||
## 9. Experiment Results
|
||||
|
||||
[📊 See Experiments and Results](experiments/README.md)
|
||||
4
docu/pages/candidate_selector.md
Normal file
@@ -0,0 +1,4 @@
|
||||
::: silkmoth.candidate_selector
|
||||
rendering:
|
||||
show_signature: true
|
||||
show_source: true
|
||||
4
docu/pages/inverted_index.md
Normal file
@@ -0,0 +1,4 @@
|
||||
::: silkmoth.inverted_index
|
||||
rendering:
|
||||
show_signature: true
|
||||
show_source: true
|
||||
4
docu/pages/signature_generator.md
Normal file
@@ -0,0 +1,4 @@
|
||||
::: silkmoth.signature_generator
|
||||
rendering:
|
||||
show_signature: true
|
||||
show_source: true
|
||||
4
docu/pages/silkmoth_engine.md
Normal file
@@ -0,0 +1,4 @@
|
||||
::: silkmoth.silkmoth_engine
|
||||
rendering:
|
||||
show_signature: true
|
||||
show_source: true
|
||||
4
docu/pages/tokenizer.md
Normal file
@@ -0,0 +1,4 @@
|
||||
::: silkmoth.tokenizer
|
||||
rendering:
|
||||
show_signature: true
|
||||
show_source: true
|
||||
4
docu/pages/utils.md
Normal file
@@ -0,0 +1,4 @@
|
||||
::: silkmoth.utils
|
||||
rendering:
|
||||
show_signature: true
|
||||
show_source: true
|
||||
4
docu/pages/verifier.md
Normal file
@@ -0,0 +1,4 @@
|
||||
::: silkmoth.verifier
|
||||
rendering:
|
||||
show_signature: true
|
||||
show_source: true
|
||||
20
docu/write_modules.py
Normal file
@@ -0,0 +1,20 @@
|
||||
"""Generate mkdocstrings stub pages for the SilkMoth API docs.

For every module in ``src/silkmoth`` (except ``__init__``), write a
``docu/pages/<module>.md`` file containing a mkdocstrings ``:::`` directive
so mkdocs renders the module's API documentation.
"""
import glob
import os

# Source modules to document; sorted so stub generation order is deterministic
# (glob.glob returns files in arbitrary, OS-dependent order).
MODULES = sorted(glob.glob("src/silkmoth/*.py"))
OUT_DIR = "docu/pages"

os.makedirs(OUT_DIR, exist_ok=True)

for path in MODULES:
    # Module name without directory or ".py" extension.
    name = os.path.splitext(os.path.basename(path))[0]
    if name == "__init__":
        continue  # package marker, not a documented module

    # One stub per module: the mkdocstrings directive plus rendering options.
    doc_path = os.path.join(OUT_DIR, f"{name}.md")
    with open(doc_path, "w") as f:
        f.write(
            f"::: silkmoth.{name}\n"
            "    rendering:\n"
            "        show_signature: true\n"
            "        show_source: true\n"
        )
|
||||
|
||||
|
||||