Python Samples
Extract Folder and File Names to Multi-Sheet Excel
import os
import pandas as pd
def extract_folder_file_names(root_dir, output_excel):
    """
    Walk *root_dir*, collect every folder and file it contains, and write the
    listing to *output_excel*, spreading rows across multiple sheets whenever
    the total exceeds Excel's per-sheet row limit.

    Args:
        root_dir (str): Path of the directory tree to scan.
        output_excel (str): Destination Excel workbook (e.g., "output.xlsx").
    """
    # Excel allows 1,048,576 rows per sheet; one row is used by the header,
    # leaving 1,048,575 data rows per sheet.
    MAX_ROWS_PER_SHEET = 1048575

    print(f"Scanning directory: {root_dir}...")
    entries = []
    for current, subdirs, filenames in os.walk(root_dir):
        entries.extend(
            {'Type': 'Folder', 'Name': d, 'Path': os.path.join(current, d)}
            for d in subdirs
        )
        entries.extend(
            {'Type': 'File', 'Name': f, 'Path': os.path.join(current, f)}
            for f in filenames
        )
    print(f"Found {len(entries)} items (folders/files).")

    if not entries:
        print("No folders or files found to write to Excel.")
        return

    # The 'with' statement guarantees the workbook is flushed and closed.
    with pd.ExcelWriter(output_excel, engine='openpyxl') as writer:
        # Ceiling division: number of sheets needed to hold all rows.
        sheet_count = -(-len(entries) // MAX_ROWS_PER_SHEET)
        print(f"Writing data to {sheet_count} sheet(s) in {output_excel}...")
        for idx in range(sheet_count):
            chunk = entries[idx * MAX_ROWS_PER_SHEET:(idx + 1) * MAX_ROWS_PER_SHEET]
            title = f"Sheet{idx + 1}"
            pd.DataFrame(chunk).to_excel(writer, sheet_name=title, index=False)
            print(f" - Wrote {len(chunk)} rows to '{title}'.")
    print(f"Successfully extracted folder and file names to '{output_excel}'.")
if __name__ == "__main__":
    # Example values:
    # root_directory = "C:\\Users\\Kishore\\Desktop\\New"
    # output_file = "C:\\Users\\Kishore\\Desktop\\New\\output.xlsx"
    root_directory = input("Enter the path to the root directory: ")
    output_file = input("Enter the path to the output Excel file (e.g., C:\\path\\to\\file.xlsx): ")
    # Reject destinations that are not Excel workbooks before doing any work.
    if output_file.lower().endswith(('.xlsx', '.xls')):
        extract_folder_file_names(root_directory, output_file)
    else:
        print("Error: Output file must have a .xlsx or .xls extension.")
Input Commands & Output
PS D:\Python\Extract> python D:\Python\Extract\main.py
Enter the path to the root directory: G:\
Enter the path to the output Excel file (e.g., C:\path\to\file.xlsx): C:\Users\Kishore\Desktop\New\output.xlsx
Scanning directory: G:\...
Found 1301168 items (folders/files).
Writing data to 2 sheet(s) in C:\Users\Kishore\Desktop\New\output.xlsx...
- Wrote 1048575 rows to 'Sheet1'.
- Wrote 252593 rows to 'Sheet2'.
Successfully extracted folder and file names to 'C:\Users\Kishore\Desktop\New\output.xlsx'.
PS D:\Python\Extract>
Youtube Video Summary and Chapters using Bedrock LLM
# Install dependencies first (the script uses boto3 and youtube-transcript-api;
# openai/pytube are not imported anywhere below):
# pip install boto3 youtube-transcript-api
import boto3
import json
from youtube_transcript_api import YouTubeTranscriptApi
from urllib.parse import urlparse, parse_qs
# Configure the AWS Bedrock runtime client used to invoke the model below.
bedrock_runtime = boto3.client(
    "bedrock-runtime",
    region_name="us-east-1"  # use a region where the chosen model is available
)
# NOTE(review): the original comment labeled this "Llama 3.3 70B Instruct",
# but the ID selects Anthropic Claude v2 — confirm which model is intended.
MODEL_ID = "anthropic.claude-v2"
def extract_video_id(url):
    """
    Extract the video ID from a YouTube URL.

    Supports short links (youtu.be/<id>), standard watch URLs
    (youtube.com/watch?v=<id>), and /embed/ and /v/ style paths, on
    www.youtube.com, youtube.com, and m.youtube.com hosts.

    Args:
        url (str): Any YouTube video URL.

    Returns:
        str | None: The video ID, or None when the URL is not recognised
        (including watch URLs that lack a 'v' query parameter, which
        previously raised KeyError).
    """
    parsed = urlparse(url)
    if parsed.hostname == 'youtu.be':
        # Short link: the path (minus leading '/') is the ID.
        return parsed.path[1:] or None
    if parsed.hostname in ('www.youtube.com', 'youtube.com', 'm.youtube.com'):
        if parsed.path == '/watch':
            # .get() avoids a KeyError when the 'v' parameter is missing.
            ids = parse_qs(parsed.query).get('v')
            return ids[0] if ids else None
        if parsed.path.startswith('/embed/') or parsed.path.startswith('/v/'):
            return parsed.path.split('/')[2]
    return None
def get_transcript(video_id):
    """Fetch the transcript for *video_id* and join all caption segments
    into a single space-separated string."""
    segments = YouTubeTranscriptApi.get_transcript(video_id)
    return ' '.join(segment['text'] for segment in segments)
def summarize_and_chapterize(transcript):
    """
    Ask the Bedrock model to summarize *transcript* and produce chapters.

    Args:
        transcript (str): Full transcript text; only the first 7,000
            characters are sent to keep the prompt size conservative.

    Returns:
        str: The model's completion text (summary followed by chapters).
    """
    system_prompt = (
        "You are an expert summarizer.\n\n"
        "TASK:\n"
        "1. Summarize the following video transcript.\n"
        "2. Break the content into well-titled chapters with timestamps (assume even spacing).\n"
        "3. Format output like this:\n\n"
        "### Summary\n"
        # BUG FIX: the original line was '"<"Summary here">\n\n"' — bare
        # identifiers between string literals, a Python syntax error.
        "<Summary here>\n\n"
        "### Chapters\n"
        "- 00:00 - Introduction\n"
        "- 02:13 - Topic 1: ...\n"
        "- 05:47 - Topic 2: ...\n\n"
        "TRANSCRIPT:\n"
    )
    prompt = f"{system_prompt}{transcript[:7000]}"

    # Anthropic (Claude) text-completion request body.
    body = {
        "prompt": f"\n\nHuman: {prompt}\n\nAssistant:",
        "max_tokens_to_sample": 2048,
        "temperature": 0.5,
        "top_k": 250,
        "top_p": 0.9,
        "stop_sequences": ["\n\nHuman:"]
    }
    response = bedrock_runtime.invoke_model(
        modelId=MODEL_ID,
        contentType="application/json",
        accept="application/json",
        body=json.dumps(body)
    )
    result = json.loads(response['body'].read())
    # Claude's completion-style API returns the generated text under 'completion'.
    return result['completion']
def summarize_youtube_video(url):
    """Fetch a video's transcript and hand it to the LLM; return the
    generated summary-and-chapters text (or an error string for bad URLs)."""
    vid = extract_video_id(url)
    if not vid:
        return "Invalid YouTube URL"
    print(f"[+] Fetching transcript for video ID: {vid}")
    text = get_transcript(vid)
    print("transcript", text)
    print("[+] Sending to LLM for summary and chapter creation...")
    summary = summarize_and_chapterize(text)
    print("result", summary)
    return summary
# Script entry point: prompt for a URL and print the generated summary.
if __name__ == "__main__":
    video_url = input("Enter YouTube video URL: ")
    summary_text = summarize_youtube_video(video_url)
    print("\n--- Generated Summary and Chapters ---\n")
    print(summary_text)