Python Samples
Extract Folder and File Names to Multi-Sheet Excel
import os

import pandas as pd

# Excel's hard limit is 1,048,576 rows per sheet; reserve one row for the
# header, leaving 1,048,575 data rows per sheet.
MAX_ROWS_PER_SHEET = 1048575


def scan_directory(root_dir):
    """Walk *root_dir* and collect one record per folder and per file.

    Args:
        root_dir (str): Path of the directory tree to scan.

    Returns:
        list[dict]: Records with keys 'Type' ('Folder' or 'File'),
        'Name' (base name), and 'Path' (full joined path).
    """
    records = []
    for root, dirs, files in os.walk(root_dir):
        for dir_name in dirs:
            records.append({'Type': 'Folder', 'Name': dir_name,
                            'Path': os.path.join(root, dir_name)})
        for file_name in files:
            records.append({'Type': 'File', 'Name': file_name,
                            'Path': os.path.join(root, file_name)})
    return records


def extract_folder_file_names(root_dir, output_excel):
    """
    Extracts folder and file names from a directory and saves them to an
    Excel file, splitting data into multiple sheets if the total number of
    rows exceeds Excel's per-sheet limit.

    Args:
        root_dir (str): The path to the root directory.
        output_excel (str): The path to the output Excel file
            (e.g., "output.xlsx"). Must be .xlsx — the openpyxl engine
            cannot write the legacy .xls format.
    """
    print(f"Scanning directory: {root_dir}...")
    all_data = scan_directory(root_dir)
    print(f"Found {len(all_data)} items (folders/files).")

    if not all_data:
        print("No folders or files found to write to Excel.")
        return

    # The 'with' statement ensures the Excel file is properly closed.
    with pd.ExcelWriter(output_excel, engine='openpyxl') as writer:
        # Ceiling division: number of sheets needed for all rows.
        num_sheets = (len(all_data) + MAX_ROWS_PER_SHEET - 1) // MAX_ROWS_PER_SHEET
        print(f"Writing data to {num_sheets} sheet(s) in {output_excel}...")

        for i in range(num_sheets):
            start_row = i * MAX_ROWS_PER_SHEET
            sheet_data = all_data[start_row:start_row + MAX_ROWS_PER_SHEET]
            sheet_name = f"Sheet{i + 1}"
            pd.DataFrame(sheet_data).to_excel(writer, sheet_name=sheet_name,
                                              index=False)
            print(f" - Wrote {len(sheet_data)} rows to '{sheet_name}'.")

    print(f"Successfully extracted folder and file names to '{output_excel}'.")


if __name__ == "__main__":
    root_directory = input("Enter the path to the root directory: ")
    output_file = input("Enter the path to the output Excel file (e.g., C:\\path\\to\\file.xlsx): ")

    # Bug fix: the original also accepted '.xls', but the openpyxl engine
    # used above can only write '.xlsx', so an .xls path crashed inside
    # pd.ExcelWriter instead of being rejected here.
    if not output_file.lower().endswith('.xlsx'):
        print("Error: Output file must have a .xlsx extension.")
    else:
        extract_folder_file_names(root_directory, output_file)

# --- Input Commands & Output (example run) ---
# PS D:\Python\Extract> python D:\Python\Extract\main.py
# Enter the path to the root directory:G:\
# Enter the path to the output Excel file (e.g., C:\path\to\file.xlsx):C:\Users\Kishore\Desktop\New\output.xlsx
# Scanning directory: G:\...
# Found 1301168 items (folders/files).
# Writing data to 2 sheet(s) in C:\Users\Kishore\Desktop\New\output.xlsx...
#  - Wrote 1048575 rows to 'Sheet1'.
#  - Wrote 252593 rows to 'Sheet2'.
# Successfully extracted folder and file names to 'C:\Users\Kishore\Desktop\New\output.xlsx'.
# PS D:\Python\Extract>

# Youtube Video Summary and Chapters using Bedrock LLM
# Install dependencies first:
#   pip install boto3 youtube-transcript-api
# (the original comment listed 'openai pytube', which this script never uses,
#  and omitted boto3, which it does use)

import boto3
import json
from youtube_transcript_api import YouTubeTranscriptApi
from urllib.parse import urlparse, parse_qs

# Configure AWS Bedrock runtime client.
# Use a region where the chosen model is available.
bedrock_runtime = boto3.client(
    "bedrock-runtime",
    region_name="us-east-1"
)

# NOTE: the original comment labeled this "Llama 3.3 70B Instruct", but the
# model id is Anthropic Claude v2 — the Human:/Assistant: prompt wrapping and
# the 'completion' response key used below are Claude-specific.
MODEL_ID = "anthropic.claude-v2"


def extract_video_id(url):
    """Return the YouTube video id embedded in *url*, or None if not found.

    Handles youtu.be short links and youtube.com /watch, /embed/ and /v/
    URL shapes.
    """
    query = urlparse(url)
    if query.hostname == 'youtu.be':
        return query.path[1:]
    if query.hostname in ('www.youtube.com', 'youtube.com'):
        if query.path == '/watch':
            # Robustness fix: a /watch URL without a ?v= parameter used to
            # raise KeyError; now it returns None like the other failures.
            params = parse_qs(query.query)
            return params['v'][0] if 'v' in params else None
        if query.path.startswith('/embed/') or query.path.startswith('/v/'):
            return query.path.split('/')[2]
    return None


def get_transcript(video_id):
    """Fetch the transcript for *video_id* and join it into a single string."""
    transcript = YouTubeTranscriptApi.get_transcript(video_id)
    return ' '.join(entry['text'] for entry in transcript)


def summarize_and_chapterize(transcript):
    """Send *transcript* to Claude v2 on Bedrock; return the generated
    summary-and-chapters text.

    Raises whatever botocore raises on API failure; KeyError if the
    response lacks a 'completion' field.
    """
    system_prompt = (
        "You are an expert summarizer.\n\n"
        "TASK:\n"
        "1. Summarize the following video transcript.\n"
        "2. Break the content into well-titled chapters with timestamps (assume even spacing).\n"
        "3. Format output like this:\n\n"
        "### Summary\n"
        "<Summary here>\n\n"  # bug fix: original '"<"Summary here">' was a syntax error
        "### Chapters\n"
        "- 00:00 - Introduction\n"
        "- 02:13 - Topic 1: ...\n"
        "- 05:47 - Topic 2: ...\n\n"
        "TRANSCRIPT:\n"
    )
    # Claude v2 can handle more context, but keep the prompt conservative.
    prompt = f"{system_prompt}{transcript[:7000]}"

    body = {
        "prompt": f"\n\nHuman: {prompt}\n\nAssistant:",
        "max_tokens_to_sample": 2048,
        "temperature": 0.5,
        "top_k": 250,
        "top_p": 0.9,
        "stop_sequences": ["\n\nHuman:"]
    }

    response = bedrock_runtime.invoke_model(
        modelId=MODEL_ID,
        contentType="application/json",
        accept="application/json",
        body=json.dumps(body)
    )
    result = json.loads(response['body'].read())
    return result['completion']


def summarize_youtube_video(url):
    """End-to-end pipeline: URL -> transcript -> LLM summary and chapters.

    Returns the LLM output, or the string "Invalid YouTube URL" when no
    video id can be extracted.
    """
    video_id = extract_video_id(url)
    if not video_id:
        return "Invalid YouTube URL"
    print(f"[+] Fetching transcript for video ID: {video_id}")
    transcript = get_transcript(video_id)
    print("[+] Sending to LLM for summary and chapter creation...")
    return summarize_and_chapterize(transcript)


# Example usage
if __name__ == "__main__":
    url = input("Enter YouTube video URL: ")
    output = summarize_youtube_video(url)
    print("\n--- Generated Summary and Chapters ---\n")
    print(output)