#
# finalproj_upload.py
#
# Accepts a client file upload: inserts a new job record in the
# Transcripto database with a status of 'uploaded', uploads the
# file to S3, and sends the new job id back to the client.
#
import json
import boto3
import os
import uuid
import base64
import pathlib
import datatier
import tempfile
from configparser import ConfigParser
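
#
# Expected request shape (an illustrative sketch; the filename and values
# below are examples, not requirements beyond what the checks in the
# handler enforce):
#
#   POST /upload?job_type=transcription
#   body: { "filename": "lecture.mp3", "data": "<base64-encoded file bytes>" }
#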
def lambda_handler(event, context):
    try:
        print("**STARTING**")
        print("**lambda: final proj upload**")

        #
        # setup AWS based on config file:
        #
        config_file = 'transcripto-app-config.ini'

        os.environ['AWS_SHARED_CREDENTIALS_FILE'] = config_file

        configur = ConfigParser()
        configur.read(config_file)
        #
        # configure for S3 access:
        #
        bucketname = configur.get('s3', 'bucket_name')

        s3 = boto3.resource('s3')
        bucket = s3.Bucket(bucketname)
        #
        # configure for RDS access:
        #
        rds_endpoint = configur.get('rds', 'endpoint')
        rds_portnum = int(configur.get('rds', 'port_number'))
        rds_username = configur.get('rds', 'user_name')
        rds_pwd = configur.get('rds', 'user_pwd')
        rds_dbname = configur.get('rds', 'db_name')
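
        #
        # For reference, transcripto-app-config.ini is assumed to look
        # something like this (placeholder values, not real credentials):
        #
        #   [s3]
        #   bucket_name = my-transcripto-bucket
        #
        #   [rds]
        #   endpoint = mydb.abc123.us-east-2.rds.amazonaws.com
        #   port_number = 3306
        #   user_name = app_user
        #   user_pwd = app_password
        #   db_name = transcripto
        #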
        #
        # the user has sent us two parameters:
        #   1. filename of their file
        #   2. raw file data as a base64-encoded string
        #
        # The parameters come through the web server (or API
        # Gateway) in the body of the request, in JSON format.
        #
        print("**Accessing request body**")

        if "body" not in event:
            raise Exception("event has no body")

        try:
            body = json.loads(event["body"])
            filename = body["filename"]
            datastr = body["data"]
            file_bytes = base64.b64decode(datastr)
        except Exception:
            return {
                "statusCode": 400,
                "body": json.dumps({"error": "Invalid request body: must contain base64-encoded 'data' and 'filename'"})
            }
        #
        # open connection to the database:
        #
        print("**Opening connection**")

        dbConn = datatier.get_dbConn(rds_endpoint, rds_portnum, rds_username, rds_pwd, rds_dbname)
        print("**dbconn worked**")
        #
        # write the raw bytes to the local filesystem so we can
        # upload the file to S3. Inside Lambda, /tmp is the only
        # writable directory, and tempfile.NamedTemporaryFile
        # creates its file there by default:
        #
        print("**Writing local data file**")

        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=pathlib.Path(filename).suffix)
        temp_file.write(file_bytes)
        temp_file.close()
        local_filename = temp_file.name

        print(f"Saved uploaded file to: {local_filename}")
        #
        # generate a unique filename in preparation for the S3 upload:
        #
        print("**Uploading local file to S3**")

        basename = pathlib.Path(filename).stem
        extension = pathlib.Path(filename).suffix

        bucketkey = "transcriptoapp/" + basename + "-" + str(uuid.uuid4()) + extension
        print("S3 bucketkey:", bucketkey)
        #
        # validate the job type (API Gateway sends queryStringParameters
        # as null when no parameters are given, hence the "or {}"):
        #
        query_params = event.get("queryStringParameters") or {}
        job_type = query_params.get("job_type", "").lower()

        if job_type not in {"transcription", "text_to_speech", "translation"}:
            return {
                "statusCode": 400,
                "body": json.dumps({"error": "Invalid job_type. Must be 'transcription', 'text_to_speech', or 'translation'."})
            }
        #
        # Remember that the processing of the uploaded file is
        # event-triggered, and that lambda function is going to update
        # the database as it processes. So let's insert a job record
        # into the database first, THEN upload the file to S3. The
        # status column should be set to 'uploaded':
        #
        print("**Adding jobs row to database**")

        sql = """
            INSERT INTO jobs (jobtype, status, originaldatafile, datafilekey, resultsfilekey)
            VALUES (%s, 'uploaded', %s, %s, '');
        """

        datatier.perform_action(dbConn, sql, [job_type, filename, bucketkey])
        #
        # grab the jobid that was auto-generated by mysql:
        #
        sql = "SELECT LAST_INSERT_ID();"

        row = datatier.retrieve_one_row(dbConn, sql)
        jobid = row[0]
        print("jobid:", jobid)
        #
        # now that the DB is updated, upload the file to S3:
        #
        print("**Uploading data file to S3**")

        bucket.upload_file(
            local_filename,
            bucketkey,
            ExtraArgs={'ACL': 'public-read', 'ContentType': 'application/octet-stream'}
        )

        #
        # clean up the temp file; /tmp persists across warm Lambda
        # invocations and can fill up otherwise:
        #
        os.remove(local_filename)
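
        #
        # A note on the upload above: the 'public-read' ACL only works
        # if the bucket has ACLs enabled; buckets created with the newer
        # "bucket owner enforced" default reject it with
        # AccessControlListNotSupported, in which case access should be
        # granted via a bucket policy instead.
        #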
        #
        # respond in an HTTP-like way, i.e. with a status
        # code and body in JSON format:
        #
        print("**DONE, returning jobid**")

        return {
            'statusCode': 200,
            'body': json.dumps(str(jobid))
        }

    except Exception as err:
        print("**ERROR**")
        print(str(err))

        return {
            'statusCode': 500,
            'body': json.dumps(str(err))
        }
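
#
# For local testing, one way to exercise the handler directly (a sketch;
# "sample.mp3" and the fake event below are illustrative only):
#
#   import base64, json
#   with open("sample.mp3", "rb") as f:
#       data = base64.b64encode(f.read()).decode()
#   event = {
#       "body": json.dumps({"filename": "sample.mp3", "data": data}),
#       "queryStringParameters": {"job_type": "transcription"}
#   }
#   print(lambda_handler(event, None))
#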