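"""Telegram webhook ingest Lambda.

Validates an incoming Telegram update, inlines or stores any attachment,
and enqueues the message to an SQS FIFO queue.
"""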
import json
|
|
import os
|
|
import threading
|
|
import urllib.request
|
|
import urllib.parse
|
|
import boto3
|
|
|
|
# Cache bot token (fetched once at Lambda init)
|
|
_bot_token: str | None = None
|
|
_token_lock = threading.Lock()
|
|
|
|
TEXT_EXTENSIONS = {'.txt', '.py', '.js', '.ts', '.json', '.md', '.csv', '.xml', '.html',
|
|
'.css', '.yaml', '.yml', '.toml', '.ini', '.cfg', '.sh', '.bash',
|
|
'.sql', '.log', '.env', '.rs', '.go', '.java', '.c', '.h', '.cpp'}
|
|
MAX_INLINE_SIZE = 50 * 1024 # 50KB
|
|
|
|
|
|
def get_bot_token() -> str:
    """Return the Telegram bot token, loading it from Secrets Manager on first use.

    The secret is identified by the ``TELEGRAM_BOT_TOKEN_SECRET_ARN``
    environment variable and its ``SecretString`` is kept in the module-level
    ``_bot_token`` so later calls skip the lookup.

    Raises:
        KeyError: if ``TELEGRAM_BOT_TOKEN_SECRET_ARN`` is not set.
    """
    global _bot_token

    # Fast path: already cached, no need to take the lock.
    if _bot_token is not None:
        return _bot_token

    with _token_lock:
        # Re-check under the lock; another thread may have filled the cache
        # while this one was waiting.
        if _bot_token is None:
            secrets_client = boto3.client('secretsmanager')
            secret_arn = os.environ['TELEGRAM_BOT_TOKEN_SECRET_ARN']
            response = secrets_client.get_secret_value(SecretId=secret_arn)
            _bot_token = response['SecretString']
    return _bot_token
|
|
|
|
|
|
def send_typing(chat_id: str, thread_id: int | None = None) -> None:
|
|
"""Fire-and-forget typing action (does not raise on failure)."""
|
|
try:
|
|
token = get_bot_token()
|
|
payload: dict = {'chat_id': chat_id, 'action': 'typing'}
|
|
if thread_id is not None:
|
|
payload['message_thread_id'] = thread_id
|
|
data = json.dumps(payload).encode()
|
|
req = urllib.request.Request(
|
|
f'https://api.telegram.org/bot{token}/sendChatAction',
|
|
data=data,
|
|
headers={'Content-Type': 'application/json'},
|
|
)
|
|
urllib.request.urlopen(req, timeout=3)
|
|
except Exception:
|
|
pass # typing is best-effort
|
|
|
|
|
|
def get_file_from_telegram(file_id: str) -> tuple[str, bytes]:
    """Resolve a Telegram ``file_id`` via ``getFile`` and download its content.

    Args:
        file_id: Telegram file identifier taken from a message attachment.

    Returns:
        ``(file_path, file_bytes)`` where ``file_path`` is the path reported
        by ``getFile`` and ``file_bytes`` is the downloaded content.

    Raises:
        ValueError: if the ``getFile`` response carries no ``file_path``.
        urllib.error.URLError: on network or HTTP failures (includes
            ``HTTPError``).
    """
    token = get_bot_token()

    # Step 1: ask Telegram where the file lives.
    api_url = f'https://api.telegram.org/bot{token}/getFile'
    payload = json.dumps({'file_id': file_id}).encode()
    req = urllib.request.Request(api_url, data=payload, headers={'Content-Type': 'application/json'})
    with urllib.request.urlopen(req, timeout=30) as resp:
        reply = json.loads(resp.read())

    file_path = (reply.get('result') or {}).get('file_path', '')
    if not file_path:
        # Without a path the download URL would be just ".../file/bot<token>/",
        # which fails with an unrelated HTTP error; fail early and clearly.
        raise ValueError('Telegram getFile response did not include a file_path')

    # Step 2: download. Quote the path ("/" is kept) so spaces or other
    # special characters cannot produce an invalid request URL.
    download_url = f'https://api.telegram.org/file/bot{token}/{urllib.parse.quote(file_path)}'
    with urllib.request.urlopen(download_url, timeout=60) as resp:
        file_bytes = resp.read()
    return file_path, file_bytes
|
|
|
|
|
|
def extract_attachment(message: dict) -> dict | None:
|
|
"""Extract file attachment info from a Telegram message. Returns metadata dict or None."""
|
|
# Priority: document > photo > audio > video > voice > video_note
|
|
if 'document' in message:
|
|
doc = message['document']
|
|
return {'type': 'document', 'file_id': doc['file_id'],
|
|
'file_name': doc.get('file_name', 'document'), 'mime_type': doc.get('mime_type', ''),
|
|
'file_size': doc.get('file_size', 0)}
|
|
if 'photo' in message:
|
|
# Take largest photo (last in array)
|
|
photo = message['photo'][-1]
|
|
return {'type': 'photo', 'file_id': photo['file_id'],
|
|
'file_name': 'photo.jpg', 'mime_type': 'image/jpeg',
|
|
'file_size': photo.get('file_size', 0)}
|
|
if 'audio' in message:
|
|
audio = message['audio']
|
|
return {'type': 'audio', 'file_id': audio['file_id'],
|
|
'file_name': audio.get('file_name', 'audio.ogg'), 'mime_type': audio.get('mime_type', 'audio/ogg'),
|
|
'file_size': audio.get('file_size', 0)}
|
|
if 'video' in message:
|
|
video = message['video']
|
|
return {'type': 'video', 'file_id': video['file_id'],
|
|
'file_name': video.get('file_name', 'video.mp4'), 'mime_type': video.get('mime_type', 'video/mp4'),
|
|
'file_size': video.get('file_size', 0)}
|
|
if 'voice' in message:
|
|
voice = message['voice']
|
|
return {'type': 'voice', 'file_id': voice['file_id'],
|
|
'file_name': 'voice.ogg', 'mime_type': voice.get('mime_type', 'audio/ogg'),
|
|
'file_size': voice.get('file_size', 0)}
|
|
return None
|
|
|
|
|
|
def is_text_file(file_name: str, mime_type: str) -> bool:
    """Report whether an attachment should be inlined as text.

    A file counts as text when its (case-insensitive) extension is listed in
    ``TEXT_EXTENSIONS`` or, failing that, when its MIME type starts with
    ``text/``.
    """
    _, suffix = os.path.splitext(file_name)
    return suffix.lower() in TEXT_EXTENSIONS or mime_type.startswith('text/')
|
|
|
|
|
|
def handler(event, context):
|
|
# ── Validate Telegram webhook secret ──────────────────────────────────
|
|
expected_secret = os.environ.get('TELEGRAM_WEBHOOK_SECRET', '')
|
|
if expected_secret:
|
|
headers = event.get('headers') or {}
|
|
received = headers.get('x-telegram-bot-api-secret-token', '')
|
|
if received != expected_secret:
|
|
return {'statusCode': 403, 'body': 'Forbidden'}
|
|
|
|
# ── Parse Telegram Update ─────────────────────────────────────────────
|
|
try:
|
|
body = json.loads(event.get('body', '{}'))
|
|
except json.JSONDecodeError:
|
|
print(f'[tg-ingest] Bad JSON body')
|
|
return {'statusCode': 400, 'body': 'Bad Request'}
|
|
|
|
print(f'[tg-ingest] Update keys: {list(body.keys())}')
|
|
update_id = body.get('update_id')
|
|
|
|
# Support regular messages and edited messages
|
|
message = body.get('message') or body.get('edited_message')
|
|
if not message:
|
|
print(f'[tg-ingest] No message field, update_type={list(body.keys())}')
|
|
return {'statusCode': 200, 'body': 'ok'}
|
|
|
|
chat_id = str(message.get('chat', {}).get('id', ''))
|
|
message_thread_id = message.get('message_thread_id') # present for supergroup topics
|
|
text = message.get('text', '') or message.get('caption', '')
|
|
from_user = message.get('from', {})
|
|
timestamp = message.get('date', 0)
|
|
|
|
# ── Detect file attachment ────────────────────────────────────────────
|
|
attachment = extract_attachment(message)
|
|
attachment_meta = None
|
|
|
|
if attachment:
|
|
print(f'[tg-ingest] Attachment detected: type={attachment["type"]} name={attachment["file_name"]} size={attachment["file_size"]}')
|
|
try:
|
|
file_path, file_bytes = get_file_from_telegram(attachment['file_id'])
|
|
file_name = attachment['file_name']
|
|
mime_type = attachment['mime_type']
|
|
|
|
if is_text_file(file_name, mime_type) and len(file_bytes) <= MAX_INLINE_SIZE:
|
|
# Inline small text files
|
|
try:
|
|
text_content = file_bytes.decode('utf-8')
|
|
except UnicodeDecodeError:
|
|
text_content = file_bytes.decode('latin-1')
|
|
attachment_meta = {
|
|
'type': attachment['type'],
|
|
'file_name': file_name,
|
|
'mime_type': mime_type,
|
|
'inline_content': text_content,
|
|
}
|
|
else:
|
|
# Store to S3
|
|
bucket = os.environ.get('ATTACHMENTS_BUCKET_NAME', '')
|
|
if bucket:
|
|
s3 = boto3.client('s3')
|
|
s3_key = f'attachments/{chat_id}/{update_id}/{file_name}'
|
|
s3.put_object(Bucket=bucket, Key=s3_key, Body=file_bytes,
|
|
ContentType=mime_type or 'application/octet-stream')
|
|
attachment_meta = {
|
|
'type': attachment['type'],
|
|
'file_name': file_name,
|
|
'mime_type': mime_type,
|
|
's3_bucket': bucket,
|
|
's3_key': s3_key,
|
|
}
|
|
print(f'[tg-ingest] Stored to s3://{bucket}/{s3_key}')
|
|
else:
|
|
print(f'[tg-ingest] No ATTACHMENTS_BUCKET_NAME configured, skipping S3 upload')
|
|
attachment_meta = {
|
|
'type': attachment['type'],
|
|
'file_name': file_name,
|
|
'mime_type': mime_type,
|
|
'error': 'S3 bucket not configured',
|
|
}
|
|
except Exception as e:
|
|
print(f'[tg-ingest] Failed to process attachment: {e}')
|
|
attachment_meta = {
|
|
'type': attachment['type'],
|
|
'file_name': attachment['file_name'],
|
|
'error': str(e),
|
|
}
|
|
|
|
print(f'[tg-ingest] chat_id={chat_id} text_len={len(text)} attachment={bool(attachment_meta)} update_id={update_id}')
|
|
|
|
if not chat_id or (not text and not attachment_meta):
|
|
print(f'[tg-ingest] Dropping: chat_id={chat_id!r} text={text!r} attachment={attachment_meta}')
|
|
return {'statusCode': 200, 'body': 'ok'}
|
|
|
|
# ── Send typing action (non-blocking, background thread) ──────────────
|
|
t = threading.Thread(target=send_typing, args=(chat_id, message_thread_id))
|
|
t.daemon = True
|
|
t.start()
|
|
|
|
# ── Enqueue to SQS FIFO ───────────────────────────────────────────────
|
|
sqs = boto3.client('sqs')
|
|
msg_body: dict = {
|
|
'channel': 'telegram',
|
|
'chat_id': chat_id,
|
|
'message_thread_id': message_thread_id,
|
|
'messages': [{
|
|
'text': text,
|
|
'from_id': str(from_user.get('id', '')),
|
|
'from_username': from_user.get('username', ''),
|
|
'from_name': f"{from_user.get('first_name', '')} {from_user.get('last_name', '')}".strip(),
|
|
}],
|
|
'update_id': update_id,
|
|
'timestamp': timestamp,
|
|
}
|
|
if attachment_meta:
|
|
msg_body['attachment'] = attachment_meta
|
|
|
|
sqs.send_message(
|
|
QueueUrl=os.environ['MESSAGE_QUEUE_URL'],
|
|
MessageGroupId=chat_id,
|
|
MessageDeduplicationId=str(update_id),
|
|
MessageBody=json.dumps(msg_body),
|
|
)
|
|
|
|
return {'statusCode': 200, 'body': 'ok'}
|