diff --git a/omniparse/chunking/__init__.py b/omniparse/chunking/__init__.py
index 66429f0..748c6d8 100644
--- a/omniparse/chunking/__init__.py
+++ b/omniparse/chunking/__init__.py
@@ -111,3 +111,160 @@ def chunk(self, text: str) -> list:
for i in range(0, len(words), self.step):
chunks.append(" ".join(words[i : i + self.window_size]))
return chunks
+
+
# Structural cue based chunking
class StructuralCueChunking(ChunkingStrategy):
    """
    Chunk text along common structural cues: Markdown headings (ATX and
    Setext), fenced code blocks, tables, block quotes, list items,
    horizontal rules, LaTeX math, HTML-like tags, and plain sentences.

    Inspired by https://jina.ai/tokenizer/#chunking which leverages common
    structural cues and a set of rules and heuristics intended to perform
    well across diverse content (Markdown, HTML, LaTeX, and more).

    Reference: https://gist.github.com/JeremiahZhang/2f8ae87dad836b25f40c02b8c43d16ec
    Original x post: https://x.com/JinaAI_/status/1823756993108304135

    NOTE(review): implemented with the stdlib ``re`` module. The upstream
    Jina pattern relies on ``\\p{Emoji_Presentation}`` /
    ``\\p{Extended_Pictographic}`` Unicode property classes, which stdlib
    ``re`` does not support; emoji sentence terminators are approximated
    here with explicit codepoints (ellipsis and U+2047..U+2049).
    """

    def __init__(self, max_chunk_size: int = 500, **kwargs):
        """
        Args:
            max_chunk_size (int, optional): Target upper bound, in characters,
                for a merged chunk. Defaults to 500.
            **kwargs: Ignored; accepted for interface compatibility with the
                other chunking strategies in this module.

        Returns:
            None
        """
        # Every alternation in the combined pattern is length-bounded by one
        # of these constants so matching stays bounded on pathological input.
        self.MAX_TABLE_ROWS = 20
        self.LOOKAHEAD_RANGE = 100
        self.MAX_HEADING_LENGTH = 7
        self.MAX_SENTENCE_LENGTH = 400
        self.MAX_NESTED_LIST_ITEMS = 6
        self.MAX_BLOCKQUOTE_LINES = 15
        self.MAX_NESTED_PARENTHESES = 5
        self.MAX_LIST_INDENT_SPACES = 7
        self.MAX_LIST_ITEM_LENGTH = 200
        self.MAX_TABLE_CELL_LENGTH = 200
        self.MAX_MATH_BLOCK_LENGTH = 500
        self.MAX_PARAGRAPH_LENGTH = 1000
        self.MAX_QUOTED_TEXT_LENGTH = 300
        self.MAX_INDENTED_CODE_LINES = 20
        self.MAX_CODE_BLOCK_LENGTH = 1500
        self.MAX_HTML_TABLE_LENGTH = 2000
        self.MAX_MATH_INLINE_LENGTH = 100
        self.MAX_CODE_LANGUAGE_LENGTH = 20
        self.MIN_HORIZONTAL_RULE_LENGTH = 3
        self.max_chunk_size = max_chunk_size
        self.MAX_BLOCKQUOTE_LINE_LENGTH = 200
        self.MAX_HEADING_CONTENT_LENGTH = 200
        self.MAX_STANDALONE_LINE_LENGTH = 800
        self.MAX_HEADING_UNDERLINE_LENGTH = 200
        self.MAX_HTML_TAG_CONTENT_LENGTH = 1000
        self.MAX_HTML_TAG_ATTRIBUTES_LENGTH = 100
        self.MAX_PARENTHETICAL_CONTENT_LENGTH = 200
        self.MAX_HTML_HEADING_ATTRIBUTES_LENGTH = 100

        self.pattern = self.__pattern__()

    def __pattern__(self) -> "re.Pattern":
        """Build and compile the combined structural-cue pattern.

        Returns:
            re.Pattern: Compiled alternation; the FIRST alternative that
            matches at a position wins, so more specific structures
            (headings, fenced code, tables) are listed before the generic
            sentence and line fallbacks.
        """
        import re

        # Sentence terminators: ASCII enders, a literal "...", the Unicode
        # ellipsis, and the interrobang range U+2047..U+2049.
        sentence_end = r"(?:[.!?\u2026]|\.{3}|[\u2047-\u2049])"

        # 1. ATX headings: "# ..." up to MAX_HEADING_LENGTH hash marks.
        atx_heading = (
            rf"^\#{{1,{self.MAX_HEADING_LENGTH}}}[ \t]"
            rf"[^\r\n]{{0,{self.MAX_HEADING_CONTENT_LENGTH}}}(?:\r?\n|$)"
        )
        # 2. Setext headings: a text line underlined with "---" or "===".
        setext_heading = (
            rf"^[^\r\n]{{1,{self.MAX_HEADING_CONTENT_LENGTH}}}\r?\n"
            rf"[-=]{{2,{self.MAX_HEADING_UNDERLINE_LENGTH}}}(?:\r?\n|$)"
        )
        # 3. Fenced code blocks with an optional language tag.
        fenced_code = (
            rf"^```[\w-]{{0,{self.MAX_CODE_LANGUAGE_LENGTH}}}\r?\n"
            rf"[\s\S]{{0,{self.MAX_CODE_BLOCK_LENGTH}}}?^```(?:\r?\n|$)"
        )
        # 4. Markdown tables: consecutive pipe-delimited rows.
        table = (
            rf"^\|[^\r\n]{{0,{self.MAX_TABLE_CELL_LENGTH}}}\|"
            rf"(?:\r?\n\|[^\r\n]{{0,{self.MAX_TABLE_CELL_LENGTH}}}\|)"
            rf"{{0,{self.MAX_TABLE_ROWS}}}(?:\r?\n|$)"
        )
        # 5. Block quotes: a run of ">"-prefixed lines.
        blockquote = (
            rf"^(?:>[^\r\n]{{0,{self.MAX_BLOCKQUOTE_LINE_LENGTH}}}(?:\r?\n|$))"
            rf"{{1,{self.MAX_BLOCKQUOTE_LINES}}}"
        )
        # 6. List items: bullet or numbered, with bounded indentation.
        list_item = (
            rf"^[ ]{{0,{self.MAX_LIST_INDENT_SPACES}}}(?:[-*+]|\d{{1,3}}[.)])[ \t]"
            rf"[^\r\n]{{0,{self.MAX_LIST_ITEM_LENGTH}}}(?:\r?\n|$)"
        )
        # 7. Horizontal rules: at least MIN_HORIZONTAL_RULE_LENGTH markers.
        horizontal_rule = rf"^[-*_]{{{self.MIN_HORIZONTAL_RULE_LENGTH},}}(?:\r?\n|$)"
        # 8. LaTeX math: $$...$$ blocks or $...$ inline expressions.
        latex_math = (
            rf"\$\$[\s\S]{{0,{self.MAX_MATH_BLOCK_LENGTH}}}?\$\$"
            rf"|\$[^\$\r\n]{{0,{self.MAX_MATH_INLINE_LENGTH}}}\$"
        )
        # 9. HTML-like tags with their content, or self-closing tags.
        html_tag = (
            rf"<[a-zA-Z][^>]{{0,{self.MAX_HTML_TAG_ATTRIBUTES_LENGTH}}}"
            rf"(?:>[\s\S]{{0,{self.MAX_HTML_TAG_CONTENT_LENGTH}}}?</[a-zA-Z]+>|\s*/>)"
        )
        # 10. Sentences: lazy match up to the first terminator before
        #     whitespace or end-of-input.
        sentence = (
            rf"[^\r\n]{{1,{self.MAX_STANDALONE_LINE_LENGTH}}}?"
            rf"{sentence_end}(?=\s|$)"
        )
        # 11. Fallback: the remainder of a line, bounded in length.
        line_fallback = rf"[^\r\n]{{1,{self.MAX_STANDALONE_LINE_LENGTH}}}(?=\r?\n|$)"

        alternatives = (
            atx_heading,
            setext_heading,
            fenced_code,
            table,
            blockquote,
            list_item,
            horizontal_rule,
            latex_math,
            html_tag,
            sentence,
            line_fallback,
        )
        return re.compile(
            "|".join(f"(?:{alt})" for alt in alternatives),
            re.MULTILINE | re.DOTALL,
        )

    def chunk(self, text: str) -> list:
        """
        Break down a given text into smaller chunks based on common
        structural cues and the configured maximum chunk size.

        Args:
            text (str): The input text to be chunked.

        Returns:
            list: Chunked text; each element is a non-empty stripped string.
        """
        import re

        # finditer + group(0) yields flat strings regardless of how many
        # groups the combined pattern contains.
        pieces = [m.group(0) for m in self.pattern.finditer(text)]

        # First pass: greedily merge consecutive structural pieces until the
        # running buffer would exceed max_chunk_size.
        merged = []
        buffer = ""
        for piece in pieces:
            if buffer and len(buffer) + len(piece) > self.max_chunk_size:
                merged.append(buffer.strip())
                buffer = piece
            else:
                buffer += piece
        if buffer.strip():
            merged.append(buffer.strip())

        # Second pass: any chunk still over the limit (a single oversize
        # piece) is re-split on sentence boundaries and re-merged.
        refined = []
        for chunk in merged:
            if len(chunk) <= self.max_chunk_size:
                refined.append(chunk)
                continue
            part = ""
            for sentence in re.split(r"(?<=[.!?]) +", chunk):
                if part and len(part) + len(sentence) > self.max_chunk_size:
                    refined.append(part.strip())
                    part = sentence
                else:
                    part = f"{part} {sentence}" if part else sentence
            if part.strip():
                refined.append(part.strip())

        return refined