824 lines
20 KiB
Plaintext
824 lines
20 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "c9f89a47",
|
||
"metadata": {},
|
||
"source": [
|
||
"## SilkMoth Demo"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "2ca15800",
|
||
"metadata": {},
|
||
"source": [
|
||
"### Related Set Discovery task under Set‑Containment using Jaccard Similarity"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "ea6ce5fb",
|
||
"metadata": {},
|
||
"source": [
|
||
"Import of all required modules:"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 24,
|
||
"id": "bdd1b92c",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import sys\n",
|
||
"sys.path.append(\"src\")\n",
|
||
"\n",
|
||
"from silkmoth.tokenizer import Tokenizer\n",
|
||
"from silkmoth.inverted_index import InvertedIndex\n",
|
||
"from silkmoth.signature_generator import SignatureGenerator\n",
|
||
"from silkmoth.candidate_selector import CandidateSelector\n",
|
||
"from silkmoth.verifier import Verifier\n",
|
||
"from silkmoth.silkmoth_engine import SilkMothEngine\n",
|
||
"\n",
|
||
"\n",
|
||
"from silkmoth.utils import jaccard_similarity, contain, edit_similarity, similar, SigType\n",
|
||
"\n",
|
||
"import matplotlib.pyplot as plt\n",
|
||
"from IPython.display import display, Markdown\n",
|
||
"\n",
|
||
"import numpy as np\n",
|
||
"import pandas as pd"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "bf6bf1f5",
|
||
"metadata": {},
|
||
"source": [
|
||
"Define the example dataset from the \"SilkMoth\" paper: a reference set **R** and a collection of source sets **S**.\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 25,
|
||
"id": "598a4bbf",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/markdown": [
|
||
"**Reference set (R):**"
|
||
],
|
||
"text/plain": [
|
||
"<IPython.core.display.Markdown object>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/markdown": [
|
||
"- R[0]: “77 Mass Ave Boston MA”"
|
||
],
|
||
"text/plain": [
|
||
"<IPython.core.display.Markdown object>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/markdown": [
|
||
"- R[1]: “5th St 02115 Seattle WA”"
|
||
],
|
||
"text/plain": [
|
||
"<IPython.core.display.Markdown object>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/markdown": [
|
||
"- R[2]: “77 5th St Chicago IL”"
|
||
],
|
||
"text/plain": [
|
||
"<IPython.core.display.Markdown object>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/markdown": [
|
||
"**Source sets (S):**"
|
||
],
|
||
"text/plain": [
|
||
"<IPython.core.display.Markdown object>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/markdown": [
|
||
"- S[0]: “Mass Ave St Boston 02115 | 77 Mass 5th St Boston | 77 Mass Ave 5th 02115”"
|
||
],
|
||
"text/plain": [
|
||
"<IPython.core.display.Markdown object>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/markdown": [
|
||
"- S[1]: “77 Boston MA | 77 5th St Boston 02115 | 77 Mass Ave 02115 Seattle”"
|
||
],
|
||
"text/plain": [
|
||
"<IPython.core.display.Markdown object>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/markdown": [
|
||
"- S[2]: “77 Mass Ave 5th Boston MA | Mass Ave Chicago IL | 77 Mass Ave St”"
|
||
],
|
||
"text/plain": [
|
||
"<IPython.core.display.Markdown object>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/markdown": [
|
||
"- S[3]: “77 Mass Ave MA | 5th St 02115 Seattle WA | 77 5th St Boston Seattle”"
|
||
],
|
||
"text/plain": [
|
||
"<IPython.core.display.Markdown object>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Location Dataset\n",
|
||
"reference_set = [\n",
|
||
" '77 Mass Ave Boston MA',\n",
|
||
" '5th St 02115 Seattle WA',\n",
|
||
" '77 5th St Chicago IL'\n",
|
||
"]\n",
|
||
"\n",
|
||
"# Address Dataset\n",
|
||
"source_sets = [\n",
|
||
" ['Mass Ave St Boston 02115','77 Mass 5th St Boston','77 Mass Ave 5th 02115'],\n",
|
||
" ['77 Boston MA','77 5th St Boston 02115','77 Mass Ave 02115 Seattle'],\n",
|
||
" ['77 Mass Ave 5th Boston MA','Mass Ave Chicago IL','77 Mass Ave St'],\n",
|
||
" ['77 Mass Ave MA','5th St 02115 Seattle WA','77 5th St Boston Seattle']\n",
|
||
"]\n",
|
||
"\n",
|
||
"# thresholds & q\n",
|
||
"δ = 0.7\n",
|
||
"α = 0.0\n",
|
||
"q = 3\n",
|
||
"\n",
|
||
"display(Markdown(\"**Reference set (R):**\"))\n",
|
||
"for i, r in enumerate(reference_set):\n",
|
||
" display(Markdown(f\"- R[{i}]: “{r}”\"))\n",
|
||
"display(Markdown(\"**Source sets (S):**\"))\n",
|
||
"for j, S in enumerate(source_sets):\n",
|
||
" display(Markdown(f\"- S[{j}]: “{' | '.join(S)}”\"))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "a50b350a",
|
||
"metadata": {},
|
||
"source": [
|
||
"### 1. Tokenization\n",
|
||
"Tokenize each element of R and of every source set in S into whitespace-delimited tokens, the representation used for Jaccard similarity.\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 26,
|
||
"id": "55e7b5d0",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/markdown": [
|
||
"**Tokenized Reference set (R):**"
|
||
],
|
||
"text/plain": [
|
||
"<IPython.core.display.Markdown object>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/markdown": [
|
||
"- Tokens of R[0]: {'Ave', 'MA', '77', 'Boston', 'Mass'}"
|
||
],
|
||
"text/plain": [
|
||
"<IPython.core.display.Markdown object>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/markdown": [
|
||
"- Tokens of R[1]: {'5th', 'Seattle', 'St', 'WA', '02115'}"
|
||
],
|
||
"text/plain": [
|
||
"<IPython.core.display.Markdown object>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/markdown": [
|
||
"- Tokens of R[2]: {'77', '5th', 'IL', 'St', 'Chicago'}"
|
||
],
|
||
"text/plain": [
|
||
"<IPython.core.display.Markdown object>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/markdown": [
|
||
"**Tokenized Source sets (S):**"
|
||
],
|
||
"text/plain": [
|
||
"<IPython.core.display.Markdown object>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/markdown": [
|
||
"- Tokens of S[0]: [{'Ave', 'Boston', 'St', 'Mass', '02115'}, {'77', 'Boston', '5th', 'St', 'Mass'}, {'Ave', '77', '5th', 'Mass', '02115'}]"
|
||
],
|
||
"text/plain": [
|
||
"<IPython.core.display.Markdown object>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/markdown": [
|
||
"- Tokens of S[1]: [{'Boston', 'MA', '77'}, {'77', 'Boston', '5th', 'St', '02115'}, {'Ave', '77', 'Seattle', 'Mass', '02115'}]"
|
||
],
|
||
"text/plain": [
|
||
"<IPython.core.display.Markdown object>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/markdown": [
|
||
"- Tokens of S[2]: [{'Ave', 'MA', '77', 'Boston', '5th', 'Mass'}, {'IL', 'Ave', 'Mass', 'Chicago'}, {'St', 'Ave', 'Mass', '77'}]"
|
||
],
|
||
"text/plain": [
|
||
"<IPython.core.display.Markdown object>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/markdown": [
|
||
"- Tokens of S[3]: [{'Ave', 'Mass', '77', 'MA'}, {'5th', 'Seattle', 'St', 'WA', '02115'}, {'77', 'Boston', '5th', 'Seattle', 'St'}]"
|
||
],
|
||
"text/plain": [
|
||
"<IPython.core.display.Markdown object>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"tokenizer = Tokenizer(jaccard_similarity, q)\n",
|
||
"tokenized_R = tokenizer.tokenize(reference_set)\n",
|
||
"tokenized_S = [tokenizer.tokenize(S) for S in source_sets]\n",
|
||
"\n",
|
||
"display(Markdown(\"**Tokenized Reference set (R):**\"))\n",
|
||
"for i, toks in enumerate(tokenized_R):\n",
|
||
" display(Markdown(f\"- Tokens of R[{i}]: {toks}\"))\n",
|
||
"\n",
|
||
"display(Markdown(\"**Tokenized Source sets (S):**\"))\n",
|
||
"for i, toks in enumerate(tokenized_S):\n",
|
||
" display(Markdown(f\"- Tokens of S[{i}]: {toks}\"))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "e17b807b",
|
||
"metadata": {},
|
||
"source": [
|
||
"### 2. Build Inverted Index\n",
|
||
"Builds an inverted index on the tokenized source sets and shows an example lookup."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 27,
|
||
"id": "22c7d1d6",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/markdown": [
|
||
"- Index built over 4 source sets."
|
||
],
|
||
"text/plain": [
|
||
"<IPython.core.display.Markdown object>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/markdown": [
|
||
"- Example: token “Mass” appears in [(0, 0), (0, 1), (0, 2), (1, 2), (2, 0), (2, 1), (2, 2), (3, 0)]"
|
||
],
|
||
"text/plain": [
|
||
"<IPython.core.display.Markdown object>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"index = InvertedIndex(tokenized_S)\n",
|
||
"display(Markdown(f\"- Index built over {len(source_sets)} source sets.\"))\n",
|
||
"display(Markdown(f\"- Example: token “Mass” appears in {index.get_indexes('Mass')}\"))\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "cc17daac",
|
||
"metadata": {},
|
||
"source": [
|
||
"### 3. Signature Generation"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "1c48bac2",
|
||
"metadata": {},
|
||
"source": [
|
||
"Generates the weighted signature for R given δ, α (here α=0), using Jaccard Similarity."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 28,
|
||
"id": "a36be65c",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/markdown": [
|
||
"- Selected signature tokens: **['Chicago', 'WA', 'IL', '5th']**"
|
||
],
|
||
"text/plain": [
|
||
"<IPython.core.display.Markdown object>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"sig_gen = SignatureGenerator()\n",
|
||
"signature = sig_gen.get_signature(\n",
|
||
" tokenized_R, index,\n",
|
||
" delta=δ, alpha=α,\n",
|
||
" sig_type=SigType.WEIGHTED,\n",
|
||
" sim_fun=jaccard_similarity,\n",
|
||
" q=q\n",
|
||
")\n",
|
||
"display(Markdown(f\"- Selected signature tokens: **{signature}**\"))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "938be3e2",
|
||
"metadata": {},
|
||
"source": [
|
||
"### 4. Initial Candidate Selection\n",
|
||
"\n",
|
||
"Looks up each signature token in the inverted index to form the candidate set.\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 29,
|
||
"id": "58017e27",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/markdown": [
|
||
"- Candidate set indices: **[0, 1, 2, 3]**"
|
||
],
|
||
"text/plain": [
|
||
"<IPython.core.display.Markdown object>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/markdown": [
|
||
" - S[0]: “Mass Ave St Boston 02115 | 77 Mass 5th St Boston | 77 Mass Ave 5th 02115”"
|
||
],
|
||
"text/plain": [
|
||
"<IPython.core.display.Markdown object>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/markdown": [
|
||
" - S[1]: “77 Boston MA | 77 5th St Boston 02115 | 77 Mass Ave 02115 Seattle”"
|
||
],
|
||
"text/plain": [
|
||
"<IPython.core.display.Markdown object>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/markdown": [
|
||
" - S[2]: “77 Mass Ave 5th Boston MA | Mass Ave Chicago IL | 77 Mass Ave St”"
|
||
],
|
||
"text/plain": [
|
||
"<IPython.core.display.Markdown object>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/markdown": [
|
||
" - S[3]: “77 Mass Ave MA | 5th St 02115 Seattle WA | 77 5th St Boston Seattle”"
|
||
],
|
||
"text/plain": [
|
||
"<IPython.core.display.Markdown object>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"cand_sel = CandidateSelector(\n",
|
||
" similarity_func=jaccard_similarity,\n",
|
||
" sim_metric=contain,\n",
|
||
" related_thresh=δ,\n",
|
||
" sim_thresh=α,\n",
|
||
" q=q\n",
|
||
")\n",
|
||
"\n",
|
||
"initial_cands = cand_sel.get_candidates(signature, index, len(tokenized_R))\n",
|
||
"display(Markdown(f\"- Candidate set indices: **{sorted(initial_cands)}**\"))\n",
|
||
"for j in sorted(initial_cands):\n",
|
||
" display(Markdown(f\" - S[{j}]: “{' | '.join(source_sets[j])}”\"))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "d633e5f9",
|
||
"metadata": {},
|
||
"source": [
|
||
"### 5. Check Filter\n",
|
||
"Prunes candidates by ensuring each matched element passes the local similarity bound.\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 30,
|
||
"id": "9a2bfdeb",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/markdown": [
|
||
"**Surviving after check filter:** **[0, 1, 3]**"
|
||
],
|
||
"text/plain": [
|
||
"<IPython.core.display.Markdown object>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/markdown": [
|
||
"S[0] matched:"
|
||
],
|
||
"text/plain": [
|
||
"<IPython.core.display.Markdown object>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/markdown": [
|
||
" • R[2] “77 5th St Chicago IL” → sim = 0.429"
|
||
],
|
||
"text/plain": [
|
||
"<IPython.core.display.Markdown object>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/markdown": [
|
||
" → Best sim: **0.429** | Matched elements: **1**"
|
||
],
|
||
"text/plain": [
|
||
"<IPython.core.display.Markdown object>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/markdown": [
|
||
"S[1] matched:"
|
||
],
|
||
"text/plain": [
|
||
"<IPython.core.display.Markdown object>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/markdown": [
|
||
" • R[2] “77 5th St Chicago IL” → sim = 0.429"
|
||
],
|
||
"text/plain": [
|
||
"<IPython.core.display.Markdown object>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/markdown": [
|
||
" → Best sim: **0.429** | Matched elements: **1**"
|
||
],
|
||
"text/plain": [
|
||
"<IPython.core.display.Markdown object>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/markdown": [
|
||
"S[3] matched:"
|
||
],
|
||
"text/plain": [
|
||
"<IPython.core.display.Markdown object>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/markdown": [
|
||
" • R[1] “5th St 02115 Seattle WA” → sim = 1.000"
|
||
],
|
||
"text/plain": [
|
||
"<IPython.core.display.Markdown object>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/markdown": [
|
||
" • R[2] “77 5th St Chicago IL” → sim = 0.429"
|
||
],
|
||
"text/plain": [
|
||
"<IPython.core.display.Markdown object>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/markdown": [
|
||
" → Best sim: **1.000** | Matched elements: **2**"
|
||
],
|
||
"text/plain": [
|
||
"<IPython.core.display.Markdown object>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"filtered_cands, match_map = cand_sel.check_filter(\n",
|
||
" tokenized_R, set(signature), initial_cands, index\n",
|
||
")\n",
|
||
"display(Markdown(f\"**Surviving after check filter:** **{sorted(filtered_cands)}**\"))\n",
|
||
"for j in sorted(filtered_cands):\n",
|
||
" display(Markdown(f\"S[{j}] matched:\"))\n",
|
||
" for r_idx, sim in match_map[j].items():\n",
|
||
" sim_text = f\"{sim:.3f}\"\n",
|
||
" display(Markdown(f\" • R[{r_idx}] “{reference_set[r_idx]}” → sim = {sim_text}\"))\n",
|
||
" \n",
|
||
" matches = match_map.get(j, {})\n",
|
||
" if matches:\n",
|
||
" best_sim = max(matches.values())\n",
|
||
" num_matches = len(matches)\n",
|
||
" display(Markdown(f\" → Best sim: **{best_sim:.3f}** | Matched elements: **{num_matches}**\"))\n",
|
||
" else:\n",
|
||
" display(Markdown(f\"No elements passed similarity checks.\"))\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "cc37bb7f",
|
||
"metadata": {},
|
||
"source": [
|
||
"### 6. Nearest‑Neighbor Filter\n",
|
||
"\n",
|
||
"Further prunes via nearest‑neighbor upper bounds on total matching score.\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 31,
|
||
"id": "aa9b7a63",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/markdown": [
|
||
"- Surviving after NN filter: **[3]**"
|
||
],
|
||
"text/plain": [
|
||
"<IPython.core.display.Markdown object>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/markdown": [
|
||
" - S[3]: “77 Mass Ave MA | 5th St 02115 Seattle WA | 77 5th St Boston Seattle”"
|
||
],
|
||
"text/plain": [
|
||
"<IPython.core.display.Markdown object>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"nn_filtered = cand_sel.nn_filter(\n",
|
||
" tokenized_R, set(signature), filtered_cands,\n",
|
||
" index, threshold=δ, match_map=match_map\n",
|
||
")\n",
|
||
"display(Markdown(f\"- Surviving after NN filter: **{sorted(nn_filtered)}**\"))\n",
|
||
"for j in nn_filtered:\n",
|
||
" display(Markdown(f\" - S[{j}]: “{' | '.join(source_sets[j])}”\"))\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "8638f83a",
|
||
"metadata": {},
|
||
"source": [
|
||
"### 7. Verification\n",
|
||
"\n",
|
||
"Runs the bipartite max‑matching on the remaining candidates and outputs the final related sets.\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 32,
|
||
"id": "ebdf20fe",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/markdown": [
|
||
"Final related sets (score ≥ 0.7):"
|
||
],
|
||
"text/plain": [
|
||
"<IPython.core.display.Markdown object>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/markdown": [
|
||
" • S[3]: “77 Mass Ave MA | 5th St 02115 Seattle WA | 77 5th St Boston Seattle” → **0.743**"
|
||
],
|
||
"text/plain": [
|
||
"<IPython.core.display.Markdown object>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"verifier = Verifier(δ, contain, jaccard_similarity, sim_thresh=α, reduction=False)\n",
|
||
"results = verifier.get_related_sets(tokenized_R, nn_filtered, index)\n",
|
||
"\n",
|
||
"if results:\n",
|
||
" display(Markdown(f\"Final related sets (score ≥ {δ}):\"))\n",
|
||
" for j, score in results:\n",
|
||
" display(Markdown(f\" • S[{j}]: “{' | '.join(source_sets[j])}” → **{score:.3f}**\"))\n",
|
||
"else:\n",
|
||
" display(Markdown(\"- No sets passed verification.\"))\n"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "silkmoth_env",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.11.13"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|