import json
import os
import threading
import urllib.parse
import urllib.request

import boto3

# Bot token cache. Populated lazily by get_bot_token() on its first call and
# reused by every later call in the same process.
_bot_token: str | None = None
_token_lock = threading.Lock()

# File extensions whose content is treated as text.
TEXT_EXTENSIONS = {
    '.txt', '.py', '.js', '.ts', '.json', '.md', '.csv', '.xml', '.html',
    '.css', '.yaml', '.yml', '.toml', '.ini', '.cfg', '.sh', '.bash', '.sql',
    '.log', '.env', '.rs', '.go', '.java', '.c', '.h', '.cpp',
}
MAX_INLINE_SIZE = 50 * 1024  # 50KB


def get_bot_token() -> str:
    """Return the Telegram bot token, fetching it from Secrets Manager once.

    The secret is identified by the ``TELEGRAM_BOT_TOKEN_SECRET_ARN``
    environment variable. The lock makes sure concurrent first callers
    trigger only one fetch.

    Raises:
        KeyError: if ``TELEGRAM_BOT_TOKEN_SECRET_ARN`` is not set.
    """
    global _bot_token
    if _bot_token is None:
        with _token_lock:
            # Re-check under the lock: another thread may have filled the
            # cache while this one was waiting.
            if _bot_token is None:
                sm = boto3.client('secretsmanager')
                _bot_token = sm.get_secret_value(
                    SecretId=os.environ['TELEGRAM_BOT_TOKEN_SECRET_ARN']
                )['SecretString']
    return _bot_token


def send_typing(chat_id: str, thread_id: int | None = None) -> None:
    """Send a best-effort ``typing`` chat action; never raises.

    Args:
        chat_id: Target chat identifier.
        thread_id: Optional ``message_thread_id`` to include in the request.
    """
    try:
        token = get_bot_token()
        payload: dict = {'chat_id': chat_id, 'action': 'typing'}
        if thread_id is not None:
            payload['message_thread_id'] = thread_id
        req = urllib.request.Request(
            f'https://api.telegram.org/bot{token}/sendChatAction',
            data=json.dumps(payload).encode(),
            headers={'Content-Type': 'application/json'},
        )
        # Close the response explicitly so the connection is not leaked.
        with urllib.request.urlopen(req, timeout=3):
            pass
    except Exception as exc:
        # Typing is best-effort. Log only the exception type so that nothing
        # derived from the token-bearing URL can end up in the logs.
        print(f'[tg-ingest] sendChatAction failed: {type(exc).__name__}')


def get_file_from_telegram(file_id: str) -> tuple[str, bytes]:
    """Resolve ``file_id`` via getFile, then download the file.

    Args:
        file_id: Telegram file identifier.

    Returns:
        ``(file_path, file_bytes)`` where ``file_path`` is the path reported
        by getFile and ``file_bytes`` is the downloaded content.

    Raises:
        ValueError: if the getFile reply carries no ``file_path``.
        urllib.error.URLError: on HTTP or network failure of either request.
    """
    token = get_bot_token()

    # Step 1: getFile
    req = urllib.request.Request(
        f'https://api.telegram.org/bot{token}/getFile',
        data=json.dumps({'file_id': file_id}).encode(),
        headers={'Content-Type': 'application/json'},
    )
    with urllib.request.urlopen(req, timeout=30) as resp:
        reply = json.loads(resp.read())

    result = reply.get('result') or {}
    file_path = result.get('file_path', '')
    if not file_path:
        # Without a path the download URL would be meaningless; fail with a
        # clear message instead of a confusing HTTP error from the download.
        reason = reply.get('description', 'unknown error')
        raise ValueError(f'Telegram getFile returned no file_path: {reason}')

    # Step 2: download (quote the path so unusual characters stay URL-safe)
    download_url = (
        f'https://api.telegram.org/file/bot{token}/'
        f'{urllib.parse.quote(file_path)}'
    )
    with urllib.request.urlopen(download_url, timeout=60) as resp:
        file_bytes = resp.read()

    return file_path, file_bytes


def extract_attachment(message: dict) -> dict | None:
    """Extract file attachment info from a Telegram message.

    Priority: document > photo > audio > video > voice > video_note.

    Returns:
        A dict with keys ``type``, ``file_id``, ``file_name``, ``mime_type``
        and ``file_size``, or ``None`` when the message has no supported
        attachment.
    """
    doc = message.get('document')
    if doc:
        return {
            'type': 'document',
            'file_id': doc['file_id'],
            'file_name': doc.get('file_name', 'document'),
            'mime_type': doc.get('mime_type', ''),
            'file_size': doc.get('file_size', 0),
        }

    photos = message.get('photo')
    if photos:
        # Take largest photo (last in array)
        photo = photos[-1]
        return {
            'type': 'photo',
            'file_id': photo['file_id'],
            'file_name': 'photo.jpg',
            'mime_type': 'image/jpeg',
            'file_size': photo.get('file_size', 0),
        }

    audio = message.get('audio')
    if audio:
        return {
            'type': 'audio',
            'file_id': audio['file_id'],
            'file_name': audio.get('file_name', 'audio.ogg'),
            'mime_type': audio.get('mime_type', 'audio/ogg'),
            'file_size': audio.get('file_size', 0),
        }

    video = message.get('video')
    if video:
        return {
            'type': 'video',
            'file_id': video['file_id'],
            'file_name': video.get('file_name', 'video.mp4'),
            'mime_type': video.get('mime_type', 'video/mp4'),
            'file_size': video.get('file_size', 0),
        }

    voice = message.get('voice')
    if voice:
        return {
            'type': 'voice',
            'file_id': voice['file_id'],
            'file_name': 'voice.ogg',
            'mime_type': voice.get('mime_type', 'audio/ogg'),
            'file_size': voice.get('file_size', 0),
        }

    video_note = message.get('video_note')
    if video_note:
        # Name and MIME type are fixed defaults chosen here, mirroring how
        # the photo and voice branches pick theirs.
        return {
            'type': 'video_note',
            'file_id': video_note['file_id'],
            'file_name': 'video_note.mp4',
            'mime_type': 'video/mp4',
            'file_size': video_note.get('file_size', 0),
        }

    return None
def is_text_file(file_name: str, mime_type: str) -> bool: """Determine if a file should be inlined as text.""" ext = os.path.splitext(file_name)[1].lower() if ext in TEXT_EXTENSIONS: return True if mime_type.startswith('text/'): return True return False def handler(event, context): # ── Validate Telegram webhook secret ────────────────────────────────── expected_secret = os.environ.get('TELEGRAM_WEBHOOK_SECRET', '') if expected_secret: headers = event.get('headers') or {} received = headers.get('x-telegram-bot-api-secret-token', '') if received != expected_secret: return {'statusCode': 403, 'body': 'Forbidden'} # ── Parse Telegram Update ───────────────────────────────────────────── try: body = json.loads(event.get('body', '{}')) except json.JSONDecodeError: print(f'[tg-ingest] Bad JSON body') return {'statusCode': 400, 'body': 'Bad Request'} print(f'[tg-ingest] Update keys: {list(body.keys())}') update_id = body.get('update_id') # Support regular messages and edited messages message = body.get('message') or body.get('edited_message') if not message: print(f'[tg-ingest] No message field, update_type={list(body.keys())}') return {'statusCode': 200, 'body': 'ok'} chat_id = str(message.get('chat', {}).get('id', '')) message_thread_id = message.get('message_thread_id') # present for supergroup topics text = message.get('text', '') or message.get('caption', '') from_user = message.get('from', {}) timestamp = message.get('date', 0) # ── Detect file attachment ──────────────────────────────────────────── attachment = extract_attachment(message) attachment_meta = None if attachment: print(f'[tg-ingest] Attachment detected: type={attachment["type"]} name={attachment["file_name"]} size={attachment["file_size"]}') try: file_path, file_bytes = get_file_from_telegram(attachment['file_id']) file_name = attachment['file_name'] mime_type = attachment['mime_type'] if is_text_file(file_name, mime_type) and len(file_bytes) <= MAX_INLINE_SIZE: # Inline small text files try: 
text_content = file_bytes.decode('utf-8') except UnicodeDecodeError: text_content = file_bytes.decode('latin-1') attachment_meta = { 'type': attachment['type'], 'file_name': file_name, 'mime_type': mime_type, 'inline_content': text_content, } else: # Store to S3 bucket = os.environ.get('ATTACHMENTS_BUCKET_NAME', '') if bucket: s3 = boto3.client('s3') s3_key = f'attachments/{chat_id}/{update_id}/{file_name}' s3.put_object(Bucket=bucket, Key=s3_key, Body=file_bytes, ContentType=mime_type or 'application/octet-stream') attachment_meta = { 'type': attachment['type'], 'file_name': file_name, 'mime_type': mime_type, 's3_bucket': bucket, 's3_key': s3_key, } print(f'[tg-ingest] Stored to s3://{bucket}/{s3_key}') else: print(f'[tg-ingest] No ATTACHMENTS_BUCKET_NAME configured, skipping S3 upload') attachment_meta = { 'type': attachment['type'], 'file_name': file_name, 'mime_type': mime_type, 'error': 'S3 bucket not configured', } except Exception as e: print(f'[tg-ingest] Failed to process attachment: {e}') attachment_meta = { 'type': attachment['type'], 'file_name': attachment['file_name'], 'error': str(e), } print(f'[tg-ingest] chat_id={chat_id} text_len={len(text)} attachment={bool(attachment_meta)} update_id={update_id}') if not chat_id or (not text and not attachment_meta): print(f'[tg-ingest] Dropping: chat_id={chat_id!r} text={text!r} attachment={attachment_meta}') return {'statusCode': 200, 'body': 'ok'} # ── Send typing action (non-blocking, background thread) ────────────── t = threading.Thread(target=send_typing, args=(chat_id, message_thread_id)) t.daemon = True t.start() # ── Enqueue to SQS FIFO ─────────────────────────────────────────────── sqs = boto3.client('sqs') msg_body: dict = { 'channel': 'telegram', 'chat_id': chat_id, 'message_thread_id': message_thread_id, 'messages': [{ 'text': text, 'from_id': str(from_user.get('id', '')), 'from_username': from_user.get('username', ''), 'from_name': f"{from_user.get('first_name', '')} {from_user.get('last_name', 
'')}".strip(), }], 'update_id': update_id, 'timestamp': timestamp, } if attachment_meta: msg_body['attachment'] = attachment_meta sqs.send_message( QueueUrl=os.environ['MESSAGE_QUEUE_URL'], MessageGroupId=chat_id, MessageDeduplicationId=str(update_id), MessageBody=json.dumps(msg_body), ) return {'statusCode': 200, 'body': 'ok'}