# Provenance: uploaded by zman1x1 ("Upload 21 files", commit 3456a58)
# divide the subs into chunks for more accurate summarization
# TODO: divide the subs into chunks based on the topics
# summarize each chunk and add it to the markdown file
from rich.progress import track
class legacy_chunker:
    """Legacy manual chunker: splits text into word-aligned chunks.

    Kept for backward compatibility; prefer ``LangChainChunker`` for new code.
    """

    def __init__(self, text):
        # text: the full subtitle/transcript string to be chunked
        self.text = text

    def chunker(self, size=1000):
        """Split ``self.text`` into chunks of at most ``size`` characters.

        Words are never split across chunks; a word (plus its trailing
        space) is appended only while it still fits in the current chunk.

        Args:
            size (int, optional): maximum chunk length. Defaults to 1000.

        Returns:
            list[str]: the stripped chunks, in order of appearance.
        """
        chunks = []
        current_chunk = ""
        for word in self.text.split():
            # +1 accounts for the separating space appended after the word
            if len(current_chunk) + len(word) + 1 <= size:
                current_chunk += f"{word} "
            else:
                chunks.append(current_chunk.strip())
                current_chunk = f"{word} "
        if current_chunk:
            # flush the final partial chunk
            chunks.append(current_chunk.strip())
        return chunks

    def __sizeof__(self) -> int:
        # Length of the wrapped text, not the in-memory size of the object.
        # NOTE(review): this overrides object.__sizeof__ with different
        # semantics; kept as-is for interface compatibility.
        return len(self.text)
class LangChainChunker:
    """Chunker backed by LangChain's ``CharacterTextSplitter``."""

    def __init__(self, text):
        # text: the full subtitle/transcript string to be chunked
        self.text = text

    def chunker(self, size=1000):
        """Split ``self.text`` on spaces into chunks of roughly ``size`` chars.

        Args:
            size (int, optional): target chunk size in characters.
                Defaults to 1000.

        Returns:
            list[str]: the split chunks.
        """
        # Imported lazily so this module stays importable without langchain.
        from langchain.text_splitter import CharacterTextSplitter

        # TODO: attach the duration of the video to the chunk
        # [[chunk, duration]]
        text_splitter = CharacterTextSplitter(
            separator=" ",
            chunk_size=size,
            # NOTE(review): chunk_overlap is a character *count* (int) in
            # LangChain; 0.9 yields effectively zero overlap. If a 90%
            # overlap was intended, this should be int(size * 0.9) —
            # confirm intent before changing, as it alters output.
            chunk_overlap=0.9,
        )
        return text_splitter.split_text(self.text)

    def __sizeof__(self) -> int:
        # Length of the wrapped text, not the in-memory size of the object.
        return len(self.text)
def ChunkByChapters(chapters: list, subs: list, size=1000):
    """Chunk the youtube video subtitles based on the chapters.

    Args:
        chapters (list): chapters from the yt api; each item is a dict with
            at least ``title`` and ``time`` (chapter start, in seconds).
        subs (list): subtitles from the yt api; each item is a dict with
            ``start`` (seconds, assumed time-ordered) and ``text``.
        size (int, optional): maximum chunk length in characters.
            Defaults to 1000.

    Raises:
        Exception: if ``chapters`` is empty.

    Returns:
        dict: structure chunk_dict = {
            "chapter1": [
                [[chunk1_text], [chunk1_start_time]],
                [[chunk2_text], [chunk2_start_time]],
                ...
            ],
            ...
        }
    """
    if not chapters:
        raise Exception("No chapters found")

    # format chapters for chunking: [title, time] pairs
    fchapters = [[chapter['title'], chapter['time']] for chapter in chapters]

    ## STEP 1: convert each chapter's timestamp from "start of chapter" to
    # "last second of chapter", so the sub loop below can use a simple
    # [start, end) window instead of checking chapter membership per sub.
    # (The final chapter keeps its start time; its window is open-ended.)
    for c in range(len(fchapters) - 1):
        fchapters[c][1] = fchapters[c + 1][1] - 1

    ## STEP 2: for each chapter (including the last one, which the original
    # implementation skipped), chunk its subs and record them.
    chunk_dict = {}
    for c in track(
        range(len(fchapters)),
        description="Chunking by chapters: "
    ):
        title = fchapters[c][0]
        # window of this chapter: [start, end)
        start = 0 if c == 0 else fchapters[c - 1][1] + 1
        end = fchapters[c][1] if c < len(fchapters) - 1 else float("inf")

        chunks = []
        current_chunk = ""
        chunk_start = start
        ## STEP 2 (a): accumulate this chapter's sub lines into chunks.
        for sublinedata in subs:
            cstart: int = sublinedata['start']
            subline: str = sublinedata['text']
            if cstart < start:
                continue
            if cstart >= end:
                # subs are time-ordered, so nothing later belongs here
                break
            if len(current_chunk) + len(subline) + 1 < size:
                current_chunk += subline
            else:
                # chunk is full: emit it, then start the next chunk with
                # the current line (previously this line was dropped)
                chunks.append([[current_chunk.strip()], [chunk_start]])
                current_chunk = subline
                chunk_start = cstart
        if current_chunk.strip():
            # flush the trailing partial chunk (previously discarded)
            chunks.append([[current_chunk.strip()], [chunk_start]])
        chunk_dict[title] = chunks
    return chunk_dict