init
Update README.md
8
docu/README.md
Normal file
@@ -0,0 +1,8 @@
|
||||
### Generating Documentation Page
|
||||
|
||||
To generate a [documentation page](https://berscjak.github.io/) from source code with mkdocs, run the following from the root directory:
|
||||
|
||||
```
|
||||
pip install mkdocs "mkdocstrings[python]" mkdocs-awesome-pages-plugin
|
||||
mkdocs serve
|
||||
```
|
||||
823
docu/demo_example.ipynb
Normal file
@@ -0,0 +1,823 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "c9f89a47",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## SilkMoth Demo"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2ca15800",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Related Set Discovery task under Set‑Containment using Jaccard Similarity"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ea6ce5fb",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Import of all required modules:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"id": "bdd1b92c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import sys\n",
|
||||
"sys.path.append(\"src\")\n",
|
||||
"\n",
|
||||
"from silkmoth.tokenizer import Tokenizer\n",
|
||||
"from silkmoth.inverted_index import InvertedIndex\n",
|
||||
"from silkmoth.signature_generator import SignatureGenerator\n",
|
||||
"from silkmoth.candidate_selector import CandidateSelector\n",
|
||||
"from silkmoth.verifier import Verifier\n",
|
||||
"from silkmoth.silkmoth_engine import SilkMothEngine\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"from silkmoth.utils import jaccard_similarity, contain, edit_similarity, similar, SigType\n",
|
||||
"\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"from IPython.display import display, Markdown\n",
|
||||
"\n",
|
||||
"import numpy as np\n",
|
||||
"import pandas as pd"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "bf6bf1f5",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Define example related dataset from \"SilkMoth\" paper (reference set **R** and source sets **S**)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 25,
|
||||
"id": "598a4bbf",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"**Reference set (R):**"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"- R[0]: “77 Mass Ave Boston MA”"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"- R[1]: “5th St 02115 Seattle WA”"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"- R[2]: “77 5th St Chicago IL”"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"**Source sets (S):**"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"- S[0]: “Mass Ave St Boston 02115 | 77 Mass 5th St Boston | 77 Mass Ave 5th 02115”"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"- S[1]: “77 Boston MA | 77 5th St Boston 02115 | 77 Mass Ave 02115 Seattle”"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"- S[2]: “77 Mass Ave 5th Boston MA | Mass Ave Chicago IL | 77 Mass Ave St”"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"- S[3]: “77 Mass Ave MA | 5th St 02115 Seattle WA | 77 5th St Boston Seattle”"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Location Dataset\n",
|
||||
"reference_set = [\n",
|
||||
" '77 Mass Ave Boston MA',\n",
|
||||
" '5th St 02115 Seattle WA',\n",
|
||||
" '77 5th St Chicago IL'\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"# Address Dataset\n",
|
||||
"source_sets = [\n",
|
||||
" ['Mass Ave St Boston 02115','77 Mass 5th St Boston','77 Mass Ave 5th 02115'],\n",
|
||||
" ['77 Boston MA','77 5th St Boston 02115','77 Mass Ave 02115 Seattle'],\n",
|
||||
" ['77 Mass Ave 5th Boston MA','Mass Ave Chicago IL','77 Mass Ave St'],\n",
|
||||
" ['77 Mass Ave MA','5th St 02115 Seattle WA','77 5th St Boston Seattle']\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"# thresholds & q\n",
|
||||
"δ = 0.7\n",
|
||||
"α = 0.0\n",
|
||||
"q = 3\n",
|
||||
"\n",
|
||||
"display(Markdown(\"**Reference set (R):**\"))\n",
|
||||
"for i, r in enumerate(reference_set):\n",
|
||||
" display(Markdown(f\"- R[{i}]: “{r}”\"))\n",
|
||||
"display(Markdown(\"**Source sets (S):**\"))\n",
|
||||
"for j, S in enumerate(source_sets):\n",
|
||||
" display(Markdown(f\"- S[{j}]: “{' | '.join(S)}”\"))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a50b350a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 1. Tokenization\n",
|
||||
"Tokenize each element of R and each S using Jaccard Similarity (whitespace tokens)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 26,
|
||||
"id": "55e7b5d0",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"**Tokenized Reference set (R):**"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"- Tokens of R[0]: {'Ave', 'MA', '77', 'Boston', 'Mass'}"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"- Tokens of R[1]: {'5th', 'Seattle', 'St', 'WA', '02115'}"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"- Tokens of R[2]: {'77', '5th', 'IL', 'St', 'Chicago'}"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"**Tokenized Source sets (S):**"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"- Tokens of S[0]: [{'Ave', 'Boston', 'St', 'Mass', '02115'}, {'77', 'Boston', '5th', 'St', 'Mass'}, {'Ave', '77', '5th', 'Mass', '02115'}]"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"- Tokens of S[1]: [{'Boston', 'MA', '77'}, {'77', 'Boston', '5th', 'St', '02115'}, {'Ave', '77', 'Seattle', 'Mass', '02115'}]"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"- Tokens of S[2]: [{'Ave', 'MA', '77', 'Boston', '5th', 'Mass'}, {'IL', 'Ave', 'Mass', 'Chicago'}, {'St', 'Ave', 'Mass', '77'}]"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"- Tokens of S[3]: [{'Ave', 'Mass', '77', 'MA'}, {'5th', 'Seattle', 'St', 'WA', '02115'}, {'77', 'Boston', '5th', 'Seattle', 'St'}]"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"tokenizer = Tokenizer(jaccard_similarity, q)\n",
|
||||
"tokenized_R = tokenizer.tokenize(reference_set)\n",
|
||||
"tokenized_S = [tokenizer.tokenize(S) for S in source_sets]\n",
|
||||
"\n",
|
||||
"display(Markdown(\"**Tokenized Reference set (R):**\"))\n",
|
||||
"for i, toks in enumerate(tokenized_R):\n",
|
||||
" display(Markdown(f\"- Tokens of R[{i}]: {toks}\"))\n",
|
||||
"\n",
|
||||
"display(Markdown(\"**Tokenized Source sets (S):**\"))\n",
|
||||
"for i, toks in enumerate(tokenized_S):\n",
|
||||
" display(Markdown(f\"- Tokens of S[{i}]: {toks}\"))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e17b807b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 2. Build Inverted Index\n",
|
||||
"Builds an inverted index on the tokenized source sets and shows an example lookup."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"id": "22c7d1d6",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"- Index built over 4 source sets."
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"- Example: token “Mass” appears in [(0, 0), (0, 1), (0, 2), (1, 2), (2, 0), (2, 1), (2, 2), (3, 0)]"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"index = InvertedIndex(tokenized_S)\n",
|
||||
"display(Markdown(f\"- Index built over {len(source_sets)} source sets.\"))\n",
|
||||
"display(Markdown(f\"- Example: token “Mass” appears in {index.get_indexes('Mass')}\"))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cc17daac",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 3. Signature Generation"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1c48bac2",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Generates the weighted signature for R given δ, α (here α=0), using Jaccard Similarity."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"id": "a36be65c",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"- Selected signature tokens: **['Chicago', 'WA', 'IL', '5th']**"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"sig_gen = SignatureGenerator()\n",
|
||||
"signature = sig_gen.get_signature(\n",
|
||||
" tokenized_R, index,\n",
|
||||
" delta=δ, alpha=α,\n",
|
||||
" sig_type=SigType.WEIGHTED,\n",
|
||||
" sim_fun=jaccard_similarity,\n",
|
||||
" q=q\n",
|
||||
")\n",
|
||||
"display(Markdown(f\"- Selected signature tokens: **{signature}**\"))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "938be3e2",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 4. Initial Candidate Selection\n",
|
||||
"\n",
|
||||
"Looks up each signature token in the inverted index to form the candidate set.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 29,
|
||||
"id": "58017e27",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"- Candidate set indices: **[0, 1, 2, 3]**"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
" - S[0]: “Mass Ave St Boston 02115 | 77 Mass 5th St Boston | 77 Mass Ave 5th 02115”"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
" - S[1]: “77 Boston MA | 77 5th St Boston 02115 | 77 Mass Ave 02115 Seattle”"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
" - S[2]: “77 Mass Ave 5th Boston MA | Mass Ave Chicago IL | 77 Mass Ave St”"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
" - S[3]: “77 Mass Ave MA | 5th St 02115 Seattle WA | 77 5th St Boston Seattle”"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"cand_sel = CandidateSelector(\n",
|
||||
" similarity_func=jaccard_similarity,\n",
|
||||
" sim_metric=contain,\n",
|
||||
" related_thresh=δ,\n",
|
||||
" sim_thresh=α,\n",
|
||||
" q=q\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"initial_cands = cand_sel.get_candidates(signature, index, len(tokenized_R))\n",
|
||||
"display(Markdown(f\"- Candidate set indices: **{sorted(initial_cands)}**\"))\n",
|
||||
"for j in sorted(initial_cands):\n",
|
||||
" display(Markdown(f\" - S[{j}]: “{' | '.join(source_sets[j])}”\"))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d633e5f9",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 5. Check Filter\n",
|
||||
"Prunes candidates by ensuring each matched element passes the local similarity bound.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 30,
|
||||
"id": "9a2bfdeb",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"**Surviving after check filter:** **[0, 1, 3]**"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"S[0] matched:"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
" • R[2] “77 5th St Chicago IL” → sim = 0.429"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
" → Best sim: **0.429** | Matched elements: **1**"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"S[1] matched:"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
" • R[2] “77 5th St Chicago IL” → sim = 0.429"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
" → Best sim: **0.429** | Matched elements: **1**"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"S[3] matched:"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
" • R[1] “5th St 02115 Seattle WA” → sim = 1.000"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
" • R[2] “77 5th St Chicago IL” → sim = 0.429"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
" → Best sim: **1.000** | Matched elements: **2**"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"filtered_cands, match_map = cand_sel.check_filter(\n",
|
||||
" tokenized_R, set(signature), initial_cands, index\n",
|
||||
")\n",
|
||||
"display(Markdown(f\"**Surviving after check filter:** **{sorted(filtered_cands)}**\"))\n",
|
||||
"for j in sorted(filtered_cands):\n",
|
||||
" display(Markdown(f\"S[{j}] matched:\"))\n",
|
||||
" for r_idx, sim in match_map[j].items():\n",
|
||||
" sim_text = f\"{sim:.3f}\"\n",
|
||||
" display(Markdown(f\" • R[{r_idx}] “{reference_set[r_idx]}” → sim = {sim_text}\"))\n",
|
||||
" \n",
|
||||
" matches = match_map.get(j, {})\n",
|
||||
" if matches:\n",
|
||||
" best_sim = max(matches.values())\n",
|
||||
" num_matches = len(matches)\n",
|
||||
" display(Markdown(f\" → Best sim: **{best_sim:.3f}** | Matched elements: **{num_matches}**\"))\n",
|
||||
" else:\n",
|
||||
" display(Markdown(f\"No elements passed similarity checks.\"))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cc37bb7f",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 6. Nearest‑Neighbor Filter\n",
|
||||
"\n",
|
||||
"Further prunes via nearest‑neighbor upper bounds on total matching score.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 31,
|
||||
"id": "aa9b7a63",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"- Surviving after NN filter: **[3]**"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
" - S[3]: “77 Mass Ave MA | 5th St 02115 Seattle WA | 77 5th St Boston Seattle”"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"nn_filtered = cand_sel.nn_filter(\n",
|
||||
" tokenized_R, set(signature), filtered_cands,\n",
|
||||
" index, threshold=δ, match_map=match_map\n",
|
||||
")\n",
|
||||
"display(Markdown(f\"- Surviving after NN filter: **{sorted(nn_filtered)}**\"))\n",
|
||||
"for j in nn_filtered:\n",
|
||||
" display(Markdown(f\" - S[{j}]: “{' | '.join(source_sets[j])}”\"))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "8638f83a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 7. Verification\n",
|
||||
"\n",
|
||||
"Runs the bipartite max‑matching on the remaining candidates and outputs the final related sets.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 32,
|
||||
"id": "ebdf20fe",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"Final related sets (score ≥ 0.7):"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
" • S[3]: “77 Mass Ave MA | 5th St 02115 Seattle WA | 77 5th St Boston Seattle” → **0.743**"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"verifier = Verifier(δ, contain, jaccard_similarity, sim_thresh=α, reduction=False)\n",
|
||||
"results = verifier.get_related_sets(tokenized_R, nn_filtered, index)\n",
|
||||
"\n",
|
||||
"if results:\n",
|
||||
" display(Markdown(f\"Final related sets (score ≥ {δ}):\"))\n",
|
||||
" for j, score in results:\n",
|
||||
" display(Markdown(f\" • S[{j}]: “{' | '.join(source_sets[j])}” → **{score:.3f}**\"))\n",
|
||||
"else:\n",
|
||||
" display(Markdown(\"- No sets passed verification.\"))\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "silkmoth_env",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
155
docu/experiments/README.md
Normal file
@@ -0,0 +1,155 @@
|
||||
### 🧪 Running the Experiments
|
||||
|
||||
This project includes multiple experiments to evaluate the performance and accuracy of our Python implementation of **SilkMoth**.
|
||||
|
||||
---
|
||||
|
||||
#### 📊 1. Experiment Types
|
||||
|
||||
You can replicate and customize the following types of experiments using different configurations (e.g., filters, signature strategies, reduction techniques):
|
||||
|
||||
- **String Matching (DBLP Publication Titles)**
|
||||
- **Schema Matching (WebTables)**
|
||||
- **Inclusion Dependency Discovery (WebTable Columns)**
|
||||
|
||||
Exact descriptions can be found in the official paper.
|
||||
|
||||
---
|
||||
|
||||
#### 📦 2. WebSchema Inclusion Dependency Setup
|
||||
|
||||
To run the **WebSchema + Inclusion Dependency** experiments:
|
||||
|
||||
1. Download the pre-extracted dataset from
|
||||
[📥 this link](https://tubcloud.tu-berlin.de/s/D4ngEfdn3cJ3pxF).
|
||||
2. Place the `.json` files in the `data/webtables/` directory
|
||||
*(create the folder if it does not exist)*.
|
||||
|
||||
---
|
||||
|
||||
#### 🚀 3. Running the Experiments
|
||||
|
||||
To execute the core experiments from the paper:
|
||||
|
||||
```bash
|
||||
python run.py
|
||||
```
|
||||
|
||||
### 📈 4. Results Overview
|
||||
|
||||
We compared our results with those presented in the original SilkMoth paper.
|
||||
Although exact reproduction is not possible due to language differences (Python vs C++) and dataset variations, overall **performance trends align well**.
|
||||
|
||||
All the results can be found in the folder `results`.
|
||||
|
||||
The **left** diagrams are from the paper and the **right** are ours.
|
||||
|
||||
> 💡 *Recent performance enhancements leverage `scipy`’s C-accelerated matching, replacing the original `networkx`-based approach.
|
||||
> Unless otherwise specified, the diagrams shown are generated using the `networkx` implementation.*
|
||||
|
||||
|
||||
---
|
||||
|
||||
### 🔍 Inclusion Dependency
|
||||
|
||||
> **Goal**: Check if each reference set is contained within source sets.
|
||||
|
||||
**Filter Comparison**
|
||||
<p align="center">
|
||||
<img src="silkmoth_results/inclusion_dep_filter.png" alt="Our Result" width="45%" />
|
||||
<img src="results/inclusion_dependency/inclusion_dependency_filter_experiment_α=0.5.png" alt="Original Result" width="45%" />
|
||||
</p>
|
||||
|
||||
**Signature Comparison**
|
||||
<p align="center">
|
||||
<img src="silkmoth_results/inclusion_dep_sig.png" alt="Our Result" width="45%" />
|
||||
<img src="results/inclusion_dependency/inclusion_dependency_sig_experiment_α=0.5.png" alt="Original Result" width="45%" />
|
||||
</p>
|
||||
|
||||
**Reduction Comparison**
|
||||
<p align="center">
|
||||
<img src="silkmoth_results/inclusion_dep_red.png" alt="Our Result" width="45%" />
|
||||
<img src="results/inclusion_dependency/inclusion_dependency_reduction_experiment_α=0.0.png" alt="Original Result" width="45%" />
|
||||
</p>
|
||||
|
||||
**Scalability**
|
||||
<p align="center">
|
||||
<img src="silkmoth_results/inclusion_dep_scal.png" alt="Our Result" width="45%" />
|
||||
<img src="results/inclusion_dependency/inclusion_dependency_scalability_experiment_α=0.5.png" alt="Original Result" width="45%" />
|
||||
</p>
|
||||
|
||||
---
|
||||
|
||||
### 🔍 Schema Matching (WebTables)
|
||||
|
||||
> **Goal**: Detect related set pairs within a single source set.
|
||||
|
||||
**Filter Comparison**
|
||||
<p align="center">
|
||||
<img src="silkmoth_results/schema_matching_filter.png" alt="Our Result" width="45%" />
|
||||
<img src="results/schema_matching/schema_matching_filter_experiment_α=0.png" alt="Original Result" width="45%" />
|
||||
</p>
|
||||
|
||||
**Signature Comparison**
|
||||
<p align="center">
|
||||
<img src="silkmoth_results/schema_matching_sig.png" alt="Our Result" width="45%" />
|
||||
<img src="results/schema_matching/schema_matching_sig_experiment_α=0.0.png" alt="Original Result" width="45%" />
|
||||
</p>
|
||||
|
||||
**Scalability**
|
||||
<p align="center">
|
||||
<img src="silkmoth_results/schema_matching_scal.png" alt="Our Result" width="45%" />
|
||||
<img src="results/schema_matching/schema_matching_scalability_experiment_α=0.0.png" alt="Original Result" width="45%" />
|
||||
</p>
|
||||
|
||||
---
|
||||
|
||||
### 🔍 String Matching (DBLP Publication Titles)
|
||||
>**Goal:** Detect related titles within the dataset using the extended SilkMoth pipeline
|
||||
> based on **edit similarity** and **q-gram** tokenization.
|
||||
> SciPy was used here.
|
||||
|
||||
**Filter Comparison**
|
||||
<p align="center">
|
||||
<img src="silkmoth_results/string_matching_filter.png" alt="Our Result" width="45%" />
|
||||
<img src="results/string_matching/10k-set-size/string_matching_filter_experiment_α=0.8.png" alt="Original Result" width="45%" />
|
||||
</p>
|
||||
|
||||
**Signature Comparison**
|
||||
<p align="center">
|
||||
<img src="silkmoth_results/string_matching_sig.png" alt="Our Result" width="45%" />
|
||||
<img src="results/string_matching/10k-set-size/string_matching_sig_experiment_α=0.8.png" alt="Original Result" width="45%" />
|
||||
</p>
|
||||
|
||||
**Scalability**
|
||||
<p align="center">
|
||||
<img src="silkmoth_results/string_matching_scal.png" alt="Our Result" width="45%" />
|
||||
<img src="results/string_matching/string_matching_scalability_experiment_α=0.8.png" alt="Original Result" width="45%" />
|
||||
</p>
|
||||
---
|
||||
|
||||
### 🔍 Additional: Inclusion Dependency SilkMoth Filter compared with no SilkMoth
|
||||
|
||||
> In this analysis, we focus exclusively on SilkMoth. But how does it compare to a
|
||||
> brute-force approach that skips the SilkMoth pipeline entirely? The graph below
|
||||
> shows the Filter run alongside the brute-force bipartite matching method without any
|
||||
> optimization pipeline. The results clearly demonstrate a dramatic improvement
|
||||
> in runtime efficiency when using SilkMoth.
|
||||
|
||||
|
||||
<img src="results/inclusion_dependency/inclusion_dependency_filter_combined_raw_experiment_α=0.5.png" alt="WebTables Result" />
|
||||
|
||||
|
||||
---
|
||||
|
||||
### 🔍 Additional: Schema Matching with GitHub WebTables
|
||||
|
||||
> Similar to Schema Matching, this experiment uses a GitHub WebTable as a fixed reference set and matches it against other sets. The goal is to evaluate SilkMoth’s performance across different domains.
|
||||

**Left:** Matching with one reference set.
|
||||
**Right:** Matching with WebTable Corpus and GitHub WebTable datasets.
|
||||
The results show no significant difference, indicating consistent behavior across varying datasets.
|
||||
|
||||
<p align="center">
|
||||
<img src="results/schema_matching/schema_matching_filter_experiment_α=0.5.png" alt="WebTables Result" width="45%" />
|
||||
<img src="results/schema_matching/github_webtable_schema_matching_experiment_α=0.5.png" alt="GitHub Table Result" width="45%" />
|
||||
</p>
|
||||
|
After Width: | Height: | Size: 125 KiB |
|
After Width: | Height: | Size: 151 KiB |
|
After Width: | Height: | Size: 166 KiB |
|
After Width: | Height: | Size: 241 KiB |
|
After Width: | Height: | Size: 207 KiB |
64
docu/experiments/results/plot.py
Normal file
@@ -0,0 +1,64 @@
|
||||
from experiments.utils import plot_elapsed_times
|
||||
import csv
|
||||
|
||||
import csv
|
||||
|
||||
labels = []
|
||||
elapsed_times = []
|
||||
|
||||
def read_csv_add_data(filename, labels, elapsed_times):
|
||||
with open(filename, newline='') as csvfile:
|
||||
reader = csv.reader(csvfile)
|
||||
next(reader) # skip header
|
||||
times = []
|
||||
current_label = None
|
||||
for row in reader:
|
||||
sim_thresh = float(row[0])
|
||||
label = row[4]
|
||||
elapsed = float(row[5])
|
||||
|
||||
if sim_thresh == 0.5:
|
||||
if current_label != label:
|
||||
# New label group started
|
||||
if times:
|
||||
# Save times of previous label if not empty
|
||||
elapsed_times.append(times)
|
||||
times = [elapsed]
|
||||
current_label = label
|
||||
else:
|
||||
times.append(elapsed)
|
||||
|
||||
# When 4 times collected, append and reset
|
||||
if len(times) == 4:
|
||||
elapsed_times.append(times)
|
||||
times = []
|
||||
current_label = None
|
||||
|
||||
if label not in labels:
|
||||
labels.append(label)
|
||||
|
||||
# In case last label times were not appended
|
||||
if times:
|
||||
elapsed_times.append(times)
|
||||
|
||||
# Read first CSV
|
||||
read_csv_add_data('inclusion_dependency/raw_matching_experiment_results.csv', labels, elapsed_times)
|
||||
|
||||
# Read second CSV
|
||||
read_csv_add_data('inclusion_dependency/inclusion_dependency_filter_experiment_results.csv', labels, elapsed_times)
|
||||
|
||||
print("Labels:", labels)
|
||||
print("Elapsed Times:", elapsed_times)
|
||||
|
||||
# Then plot
|
||||
file_name_prefix = "inclusion_dependency_filter_combined_raw"
|
||||
folder_path = ""
|
||||
|
||||
_ = plot_elapsed_times(
|
||||
related_thresholds=[0.7, 0.75, 0.8, 0.85],
|
||||
elapsed_times_list=elapsed_times,
|
||||
fig_text=f"{file_name_prefix} (α = 0.5)",
|
||||
legend_labels=labels,
|
||||
file_name=f"{folder_path}{file_name_prefix}_experiment_α=0.5.png"
|
||||
)
|
||||
|
||||
|
After Width: | Height: | Size: 171 KiB |
|
After Width: | Height: | Size: 193 KiB |
|
After Width: | Height: | Size: 188 KiB |
|
After Width: | Height: | Size: 248 KiB |
|
After Width: | Height: | Size: 207 KiB |
|
After Width: | Height: | Size: 159 KiB |
|
After Width: | Height: | Size: 199 KiB |
|
After Width: | Height: | Size: 221 KiB |
BIN
docu/experiments/silkmoth_results/inclusion_dep_filter.png
Normal file
|
After Width: | Height: | Size: 37 KiB |
BIN
docu/experiments/silkmoth_results/inclusion_dep_red.png
Normal file
|
After Width: | Height: | Size: 30 KiB |
BIN
docu/experiments/silkmoth_results/inclusion_dep_scal.png
Normal file
|
After Width: | Height: | Size: 53 KiB |
BIN
docu/experiments/silkmoth_results/inclusion_dep_sig.png
Normal file
|
After Width: | Height: | Size: 47 KiB |
BIN
docu/experiments/silkmoth_results/schema_matching_filter.png
Normal file
|
After Width: | Height: | Size: 42 KiB |
BIN
docu/experiments/silkmoth_results/schema_matching_scal.png
Normal file
|
After Width: | Height: | Size: 48 KiB |
BIN
docu/experiments/silkmoth_results/schema_matching_sig.png
Normal file
|
After Width: | Height: | Size: 42 KiB |
BIN
docu/experiments/silkmoth_results/string_matching_filter.png
Normal file
|
After Width: | Height: | Size: 44 KiB |
BIN
docu/experiments/silkmoth_results/string_matching_scal.png
Normal file
|
After Width: | Height: | Size: 51 KiB |
BIN
docu/experiments/silkmoth_results/string_matching_sig.png
Normal file
|
After Width: | Height: | Size: 53 KiB |
BIN
docu/figures/InvertedIndex.png
Normal file
|
After Width: | Height: | Size: 62 KiB |
BIN
docu/figures/Pipeline.png
Normal file
|
After Width: | Height: | Size: 230 KiB |
151
docu/index.md
Normal file
@@ -0,0 +1,151 @@
|
||||
# 🦋 LSDIPro SS2025
|
||||
|
||||
## 📄 [SilkMoth: An Efficient Method for Finding Related Sets](https://doi.org/10.14778/3115404.3115413)
|
||||
|
||||
A project inspired by the SilkMoth paper, exploring efficient techniques for related set discovery.
|
||||
|
||||
---
|
||||
|
||||
## 👥 Team Members
|
||||
- **Andreas Wilms**
|
||||
- **Sarra Daknou**
|
||||
- **Amina Iqbal**
|
||||
- **Jakob Berschneider**
|
||||
|
||||
---
|
||||
|
||||
## 📊 Experiments & Results
|
||||
➡️ [**See Experiments**](experiments/README.md)
|
||||
|
||||
---
|
||||
|
||||
## 🧪 Interactive Demo
|
||||
|
||||
Follow our **step-by-step Jupyter Notebook demo** for a hands-on understanding of SilkMoth.
|
||||
|
||||
📓 [**Open demo_example.ipynb**](demo_example.ipynb)
|
||||
|
||||
---
|
||||
|
||||
## Table of Contents
|
||||
|
||||
- [1. Large Scale Data Integration Project (LSDIPro)](#1-large-scale-data-integration-project-lsdipro)
|
||||
- [2. What is SilkMoth? 🐛](#2-what-is-silkmoth)
|
||||
- [3. The Problem 🧩](#3-the-problem)
|
||||
- [4. SilkMoth’s Solution 🚀](#4-silkmoths-solution)
|
||||
- [5. Core Pipeline Steps 🔁](#5-core-pipeline-steps)
|
||||
- [5.1 Tokenization](#51-tokenization)
|
||||
- [5.2 Inverted Index Construction](#52-inverted-index-construction)
|
||||
- [5.3 Signature Generation](#53-signature-generation)
|
||||
- [5.4 Candidate Selection](#54-candidate-selection)
|
||||
- [5.5 Refinement Filters](#55-refinement-filters)
|
||||
- [5.6 Verification via Maximum Matching](#56-verification-via-maximum-matching)
|
||||
- [6. Modes of Operation 🧪](#6-modes-of-operation-)
|
||||
- [7. Supported Similarity Functions 📐](#7-supported-similarity-functions-)
|
||||
- [8. Installing from Source](#8-installing-from-source)
|
||||
- [9. Experiment Results](#9-experiment-results)
|
||||
|
||||
---
|
||||
|
||||
## 1. Large Scale Data Integration Project (LSDIPro)
|
||||
|
||||
As part of the university project LSDIPro, our team implemented the SilkMoth paper in Python.
|
||||
The course focuses on large-scale data integration, where student groups reproduce and extend research prototypes.
|
||||
The project emphasizes scalable algorithm design, evaluation, and handling heterogeneous data at scale.
|
||||
|
||||
---
|
||||
|
||||
## 2. What is SilkMoth?
|
||||
|
||||
**SilkMoth** is a system designed to efficiently discover related sets in large collections of data, even when the elements within those sets are only approximately similar.
|
||||
This is especially important in **data integration**, **data cleaning**, and **information retrieval**, where messy or inconsistent data is common.
|
||||
|
||||
---
|
||||
|
||||
## 3. The Problem
|
||||
|
||||
Determining whether two sets are related, for example, whether two database columns should be joined, often involves comparing their elements using **similarity functions** (not just exact matches).
|
||||
A powerful approach models this as a **bipartite graph** and finds the **maximum matching score** between elements. However, this method is **computationally expensive** (`O(n³)` per pair), making it impractical for large datasets.
|
||||
|
||||
---
|
||||
|
||||
## 4. SilkMoth’s Solution
|
||||
|
||||
SilkMoth tackles this with a three-step approach:
|
||||
|
||||
1. **Signature Generation**: Creates compact signatures for each set, ensuring related sets share signature parts.
|
||||
2. **Pruning**: Filters out unrelated sets early, reducing candidates.
|
||||
3. **Verification**: Applies the costly matching metric only to the remaining candidates, matching brute-force accuracy while running substantially faster.
|
||||
|
||||
---
|
||||
|
||||
## 5. Core Pipeline Steps
|
||||
|
||||

|
||||
|
||||
*Figure 1. SILKMOTH pipeline framework. Source: Deng et al., "SILKMOTH: An Efficient Method for Finding Related Sets with Maximum Matching Constraints", VLDB 2017. Licensed under CC BY-NC-ND 4.0.*
|
||||
|
||||
### [5.1 Tokenization](pages/tokenizer.md)
|
||||
|
||||
Each element in every set is tokenized based on the selected similarity function:
|
||||
- **Jaccard Similarity**: Elements are split into whitespace-delimited tokens.
|
||||
- **Edit Similarity**: Elements are split into overlapping `q`-grams (e.g., 3-grams).
|
||||
|
||||
### [5.2 Inverted Index Construction](pages/inverted_index.md)
|
||||
|
||||
An **inverted index** is built from the reference set `R` to map each token to a list of `(set, element)` pairs in which it occurs.
|
||||
This allows fast lookup of candidate sets sharing tokens with a query.
|
||||
|
||||
### [5.3 Signature Generation](pages/signature_generator.md)
|
||||
|
||||
A **signature** is a subset of tokens selected from each set such that:
|
||||
- Any related set must share at least one signature token.
|
||||
- Signature size is minimized to reduce candidate space.
|
||||
|
||||
Signature selection heuristics (e.g., cost/value greedy ranking) approximate the optimal valid signature, which is NP-complete to compute exactly.
|
||||
|
||||
### [5.4 Candidate Selection](pages/candidate_selector.md)
|
||||
|
||||
For each set `R`, retrieve from the inverted index all sets `S` sharing at least one token with `R`’s signature. These become **candidate sets** for further evaluation.
|
||||
|
||||
### [5.5 Refinement Filters](pages/candidate_selector.md)
|
||||
|
||||
Two filters reduce false positives among candidates:
|
||||
- **Check Filter**: Uses an upper bound on similarity to eliminate sets below threshold.
|
||||
- **Nearest Neighbor Filter**: Approximates maximum matching score using nearest neighbor similarity for each element in `R`.
|
||||
|
||||
### [5.6 Verification via Maximum Matching](pages/verifier.md)
|
||||
|
||||
Compute **maximum weighted bipartite matching** between elements of `R` and `S` for remaining candidates using the similarity function as edge weights.
|
||||
Sets meeting or exceeding threshold `δ` are considered **related**.
|
||||
|
||||
---
|
||||
|
||||
## 6. Modes of Operation 🧪
|
||||
|
||||
- **Discovery Mode**: Compare all pairs of sets to find all related pairs.
|
||||
*Use case:* Finding related columns in databases.
|
||||
|
||||
- **Search Mode**: Given a reference set, find all related sets.
|
||||
*Use case:* Schema matching or entity deduplication.
|
||||
|
||||
---
|
||||
|
||||
## 7. Supported Similarity Functions 📐
|
||||
|
||||
- **Jaccard Similarity**
|
||||
- **Edit Similarity** (Levenshtein-based)
|
||||
- Optional minimum similarity threshold `α` on element comparisons.
|
||||
|
||||
---
|
||||
|
||||
## 8. Installing from Source
|
||||
|
||||
1. From the repository root, run `pip install src/` to install the package.
|
||||
|
||||
---
|
||||
|
||||
|
||||
## 9. Experiment Results
|
||||
|
||||
[📊 See Experiments and Results](experiments/README.md)
|
||||
4
docu/pages/candidate_selector.md
Normal file
@@ -0,0 +1,4 @@
|
||||
::: silkmoth.candidate_selector
|
||||
rendering:
|
||||
show_signature: true
|
||||
show_source: true
|
||||
4
docu/pages/inverted_index.md
Normal file
@@ -0,0 +1,4 @@
|
||||
::: silkmoth.inverted_index
|
||||
rendering:
|
||||
show_signature: true
|
||||
show_source: true
|
||||
4
docu/pages/signature_generator.md
Normal file
@@ -0,0 +1,4 @@
|
||||
::: silkmoth.signature_generator
|
||||
rendering:
|
||||
show_signature: true
|
||||
show_source: true
|
||||
4
docu/pages/silkmoth_engine.md
Normal file
@@ -0,0 +1,4 @@
|
||||
::: silkmoth.silkmoth_engine
|
||||
rendering:
|
||||
show_signature: true
|
||||
show_source: true
|
||||
4
docu/pages/tokenizer.md
Normal file
@@ -0,0 +1,4 @@
|
||||
::: silkmoth.tokenizer
|
||||
rendering:
|
||||
show_signature: true
|
||||
show_source: true
|
||||
4
docu/pages/utils.md
Normal file
@@ -0,0 +1,4 @@
|
||||
::: silkmoth.utils
|
||||
rendering:
|
||||
show_signature: true
|
||||
show_source: true
|
||||
4
docu/pages/verifier.md
Normal file
@@ -0,0 +1,4 @@
|
||||
::: silkmoth.verifier
|
||||
rendering:
|
||||
show_signature: true
|
||||
show_source: true
|
||||
20
docu/write_modules.py
Normal file
@@ -0,0 +1,20 @@
|
||||
"""Generate mkdocstrings stub pages for the SilkMoth API docs.

For every module in ``src/silkmoth`` (except ``__init__``), write a
``docu/pages/<module>.md`` file containing a mkdocstrings ``:::`` directive
so mkdocs renders the module's API documentation.
"""
import glob
import os

# Source modules to document; sorted so stub generation order is deterministic
# (glob.glob returns files in arbitrary, OS-dependent order).
MODULES = sorted(glob.glob("src/silkmoth/*.py"))
OUT_DIR = "docu/pages"

os.makedirs(OUT_DIR, exist_ok=True)

for path in MODULES:
    # Module name without directory or ".py" extension.
    name = os.path.splitext(os.path.basename(path))[0]
    if name == "__init__":
        continue  # package marker, not a documented module

    # One stub per module: the mkdocstrings directive plus rendering options.
    doc_path = os.path.join(OUT_DIR, f"{name}.md")
    with open(doc_path, "w") as f:
        f.write(
            f"::: silkmoth.{name}\n"
            "    rendering:\n"
            "        show_signature: true\n"
            "        show_source: true\n"
        )
|
||||
|
||||
|
||||