Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified __pycache__/config.cpython-313.pyc
Binary file not shown.
Binary file modified __pycache__/database.cpython-313.pyc
Binary file not shown.
Binary file modified services/__pycache__/db_connector.cpython-313.pyc
Binary file not shown.
Binary file modified services/__pycache__/ml_mapper.cpython-313.pyc
Binary file not shown.
Binary file modified services/__pycache__/transformers.cpython-313.pyc
Binary file not shown.
11 changes: 10 additions & 1 deletion services/db_connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,15 +54,24 @@ def create_sqlalchemy_engine(db_type, host, port, db_name, user, password, chars

elif db_type == "Microsoft SQL Server":
# Requires: pip install pymssql
# For Thai data: use 'utf8' or 'cp874' (Thai Windows codepage)
# If source contains legacy TIS-620, try 'cp874' charset
mssql_charset = charset if charset else "utf8"

query_params = {"charset": mssql_charset}

# For legacy Thai databases, add TDS version for better compatibility
if charset in ['tis620', 'cp874', 'latin1']:
Copy link

Copilot AI Feb 4, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The condition checks whether 'charset' is in the list, but 'charset' can be None when the default charset is used (line 59). This means the TDS version will never be set when the default charset is in effect, even if it might be beneficial for older SQL Server versions. Consider checking the actual value of 'mssql_charset' instead of 'charset', or explicitly handling the None case.

Suggested change
if charset in ['tis620', 'cp874', 'latin1']:
if mssql_charset in ['tis620', 'cp874', 'latin1']:

Copilot uses AI. Check for mistakes.
query_params["tds_version"] = "7.0" # Compatible with older SQL Server

connection_url = URL.create(
"mssql+pymssql",
username=user,
password=password,
host=host,
port=port_int or 1433,
database=db_name,
query={"charset": mssql_charset}
query=query_params
)

else:
Expand Down
Binary file modified views/__pycache__/migration_engine.cpython-313.pyc
Binary file not shown.
65 changes: 59 additions & 6 deletions views/migration_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ def generate_select_query(config_data, source_table, db_type='MySQL'):
"""
Generate a SELECT query based on configuration.
Applies TRIM at source for MSSQL CHAR columns to prevent padding.
For SQL Server: Also cleans non-breaking spaces and control characters at source.
Copy link

Copilot AI Feb 4, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The docstring mentions cleaning control characters at source, but the implementation only removes CHAR(0) (null bytes) and replaces CHAR(160) (non-breaking space). Other control characters (CHAR(1)-CHAR(31), excluding newlines/tabs) are not handled. Consider whether additional control-character cleaning is needed, or whether the docstring should be updated to accurately reflect what is cleaned.

Suggested change
For SQL Server: Also cleans non-breaking spaces and control characters at source.
For SQL Server: Also replaces non-breaking spaces (CHAR(160)) with regular spaces and removes null bytes (CHAR(0)) at source.

Copilot uses AI. Check for mistakes.
"""
try:
if not config_data or 'mappings' not in config_data:
Expand All @@ -65,8 +66,23 @@ def generate_select_query(config_data, source_table, db_type='MySQL'):
continue

source_col = mapping['source']
# Apply TRIM at source for MSSQL to handle CHAR padding
if db_type == 'Microsoft SQL Server' and 'TRIM' in mapping.get('transformers', []):

# Special handling for SQL Server text columns
if db_type == 'Microsoft SQL Server':
col_expr = f'"{source_col}"'
Copy link

Copilot AI Feb 4, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The column identifier uses double quotes directly from the source column name without sanitization. If the source column name contains malicious content (e.g., special characters like double quotes), this could lead to SQL injection. Consider using SQLAlchemy's identifier quoting mechanism or validating/sanitizing column names before using them in queries.

Copilot uses AI. Check for mistakes.

# Apply TRIM if specified in transformers
if 'TRIM' in mapping.get('transformers', []):
col_expr = f'TRIM({col_expr})'

# Clean non-breaking spaces and problematic characters for VARCHAR/NVARCHAR/TEXT columns
# REPLACE(col, CHAR(160), ' ') -> replace nbsp with regular space
# REPLACE(col, CHAR(0), '') -> remove null bytes
col_expr = f'REPLACE(REPLACE({col_expr}, CHAR(160), \' \'), CHAR(0), \'\')'
Comment on lines +73 to +81
Copy link

Copilot AI Feb 4, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The REPLACE operations are unconditionally applied to all columns for SQL Server, regardless of the column's data type. This could cause issues with numeric, date, or binary columns where REPLACE operations on character data are not appropriate. Consider checking the column data type or restricting this transformation to text/varchar column types only.

Suggested change
# Apply TRIM if specified in transformers
if 'TRIM' in mapping.get('transformers', []):
col_expr = f'TRIM({col_expr})'
# Clean non-breaking spaces and problematic characters for VARCHAR/NVARCHAR/TEXT columns
# REPLACE(col, CHAR(160), ' ') -> replace nbsp with regular space
# REPLACE(col, CHAR(0), '') -> remove null bytes
col_expr = f'REPLACE(REPLACE({col_expr}, CHAR(160), \' \'), CHAR(0), \'\')'
# Determine if the source column is a text-like type where string REPLACE is appropriate.
# This relies on an optional 'source_type' field in the mapping (e.g. 'VARCHAR(50)', 'NVARCHAR', 'TEXT').
source_type_raw = str(mapping.get('source_type', '') or '')
source_type_upper = source_type_raw.upper()
text_type_prefixes = ('CHAR', 'NCHAR', 'VARCHAR', 'NVARCHAR')
is_text_column = (
any(source_type_upper.startswith(prefix) for prefix in text_type_prefixes)
or source_type_upper in ('TEXT', 'NTEXT')
)
# Apply TRIM if specified in transformers
if 'TRIM' in mapping.get('transformers', []):
col_expr = f'TRIM({col_expr})'
# Clean non-breaking spaces and problematic characters only for text-like columns.
# REPLACE(col, CHAR(160), ' ') -> replace nbsp with regular space
# REPLACE(col, CHAR(0), '') -> remove null bytes
if is_text_column:
col_expr = f'REPLACE(REPLACE({col_expr}, CHAR(160), \' \'), CHAR(0), \'\')'

Copilot uses AI. Check for mistakes.

selected_cols.append(f'{col_expr} AS "{source_col}"')
Copy link

Copilot AI Feb 4, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

SQL injection vulnerability in the AS clause. The source column name is used directly in quotes without sanitization. If a malicious column name contains double quotes or other SQL special characters, this could be exploited. Consider using SQLAlchemy's identifier quoting mechanism or validating/sanitizing column names.

Copilot uses AI. Check for mistakes.
elif 'TRIM' in mapping.get('transformers', []):
# Other databases: just apply TRIM if needed
selected_cols.append(f'TRIM("{source_col}") AS "{source_col}"')
Copy link

Copilot AI Feb 4, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This has the same SQL injection vulnerability as line 72: the source column name is wrapped in double quotes without sanitization, both inside the TRIM function and in the AS clause. Consider using SQLAlchemy's identifier quoting mechanism or validating/sanitizing column names.

Copilot uses AI. Check for mistakes.
else:
selected_cols.append(f'"{source_col}"')
Copy link

Copilot AI Feb 4, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Similar to line 72, this column identifier uses double quotes directly from the source column name without sanitization, which could lead to SQL injection if the source column name contains malicious content. Consider using SQLAlchemy's identifier quoting mechanism or validating/sanitizing column names.

Copilot uses AI. Check for mistakes.
Expand Down Expand Up @@ -293,19 +309,56 @@ def render_migration_engine_page():
src_sel = st.selectbox("Source Profile", ds_options, key="src_sel")
st.session_state.migration_src_profile = src_sel

charset_options = ["utf8mb4 (Default)", "tis620 (Thai Legacy)", "latin1 (Raw Bytes)"]
# Get source DB type to show appropriate charset options
src_db_type = None
if src_sel != "Select Profile...":
row = datasources[datasources['name'] == src_sel].iloc[0]
ds_detail = db.get_datasource_by_id(int(row['id']))
src_db_type = ds_detail['db_type']

# Show charset options based on DB type
if src_db_type == 'Microsoft SQL Server':
charset_options = [
"utf8 (Default - Modern)",
"cp874 (Thai Windows Codepage - แนะนำสำหรับข้อมูลไทยเก่า)",
"latin1 (Raw Bytes)"
]
help_text = "SQL Server: ใช้ cp874 สำหรับข้อมูลไทยแบบเก่า"
elif src_db_type == 'MySQL':
charset_options = [
"utf8mb4 (Default)",
"tis620 (Thai Legacy)",
"latin1 (Raw Bytes)"
]
help_text = "MySQL: ใช้ tis620 ถ้าภาษาไทยเพี้ยน"
else:
charset_options = [
"utf8 (Default)",
"latin1 (Raw Bytes)"
]
help_text = "เลือก charset ตามฐานข้อมูลต้นทาง"
Comment on lines +334 to +339
Copy link

Copilot AI Feb 4, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When no source profile is selected (src_db_type is None), the charset selection defaults to "utf8 (Default)" and "latin1 (Raw Bytes)" options. However, the charset_map on line 352 maps "utf8 (Default)" to None, which might not be appropriate for all database types. When a user later selects a source profile (e.g., MySQL), they won't see MySQL-specific options unless they change the selection again. Consider disabling charset selection until a source profile is selected or updating the options dynamically.

Copilot uses AI. Check for mistakes.

src_charset_sel = st.selectbox(
"Source Charset (ถ้าภาษาไทยเพี้ยนให้ลอง tis620)",
charset_options,
key="src_charset_sel"
"Source Charset",
charset_options,
key="src_charset_sel",
help=help_text
)

# Map selection to actual charset value
charset_map = {
"utf8mb4 (Default)": None,
"utf8 (Default - Modern)": None,
"utf8 (Default)": None,
"tis620 (Thai Legacy)": "tis620",
"cp874 (Thai Windows Codepage - แนะนำสำหรับข้อมูลไทยเก่า)": "cp874",
"latin1 (Raw Bytes)": "latin1"
}
st.session_state.src_charset = charset_map.get(src_charset_sel)

if src_charset_sel.startswith("cp874"):
st.info("💡 **cp874** จะแก้ปัญหา non-breaking space และตัวอักษรไทยเก่าใน SQL Server")

if src_sel != "Select Profile...":
if st.button("🔍 Test Source"):
with st.spinner("Connecting..."):
Expand Down
Loading