mercredi 18 novembre 2020

Lexical Analyzer: Differentiating between Machine Code and Line Number

I am working on a lexical analyzer that reads Cm assembly and creates tokens, and for the most part it is going perfectly. I have just two small problems. First and foremost, I am having difficulty differentiating between machine code and line numbers. As I have it right now, if the first character is a digit and the length of the string is <2, I classify it as a line number. The issue is that this is also the description of the machine code. In reality, a line number can be longer than 2 characters, and the machine code does not necessarily have to start with a digit, since it is in hexadecimal. I am having great difficulty coming up with conditions that will satisfy both. The second issue is a very minor one: when it comes to comments, the lexer reads every character except the last one. Please find a sample input as well as my code below. Thank you all very much for the help; it is greatly appreciated!

import java.util.List;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList; 


/**
 * Line-oriented lexer for Cm assembly listings.
 *
 * Each input line is split into {@link Token}s by {@link #lex(String)}; {@link #main(String[])}
 * is a small driver that lexes a file line by line and prints the resulting tokens.
 */
public class Lexer
{

    // Source file currently being lexed by main, and the path it was opened from.
    private static File srcFile = null;
    private static String srcFilename = "<srcFilename>";

    /**
     * Categories a token can belong to.
     *
     * NOTE(review): ADDRESSES and OFFSETS are declared here, but lex() only ever produces
     * ADDRESSES (for long digit-led atoms) and never OFFSETS - confirm against the Cm
     * assembly documentation how these two are meant to be recognised.
     */
    public static enum Type
    {
        MNUMONIC_NAMES, LABELS, ADDRESSES, OFFSETS, COMMENTS, LINE_NUMBER;
    }

    /** An immutable (type, text) pair produced by the lexer. */
    public static class Token
    {
        public final Type t;
        public final String c;

        /**
         * @param t the category of this token
         * @param c the exact text of the token as it appeared in the input
         */
        public Token(Type t, String c)
        {
            this.t = t;
            this.c = c;
        }

        /** Renders the token as {@code TYPE<text>}, e.g. {@code COMMENTS<; hello>}. */
        @Override
        public String toString()
        {
            // The printed prefix is exactly the enum constant's name for every Type.
            return t.name() + "<" + c + ">";
        }
    }

    /**
     * Returns the run of letters and digits in {@code s} starting at index {@code i}.
     * Used for mnemonics, labels and numeric fields, which are all single words.
     *
     * @param s the line being lexed
     * @param i index of the first character of the atom
     * @return the (possibly empty) alphanumeric run starting at {@code i}
     */
    public static String getAtom(String s, int i)
    {
        int end = i;
        while (end < s.length() && Character.isLetterOrDigit(s.charAt(end)))
        {
            end++;
        }
        return s.substring(i, end);
    }

    /**
     * Returns the comment that starts at index {@code i} and runs to the end of the line.
     *
     * @param s the line being lexed
     * @param i index of the comment's first character (the ';')
     * @return everything from {@code i} through the last character of {@code s}
     */
    public static String getComment(String s, int i)
    {
        // substring's end index is exclusive, so the whole tail is s.substring(i);
        // using length() - 1 as the end silently dropped the final character.
        return s.substring(i);
    }

    /**
     * Splits one line of input into tokens.
     *
     * Rules, in order of precedence for the character at the current position:
     * ';' starts a comment that runs to the end of the line; whitespace is skipped;
     * an ASCII digit starts a numeric field; anything else starts a word that is
     * either a mnemonic or a label.
     *
     * @param input a single line of the listing
     * @return the tokens found on that line, in order of appearance
     */
    public static List<Token> lex(String input)
    {
        List<Token> result = new ArrayList<Token>();
        int i = 0;
        while (i < input.length())
        {
            char ch = input.charAt(i);

            if (ch == ';')
            {
                // A comment consumes the rest of the line. Each branch is exclusive:
                // the original switch fell through from here into the number and
                // default cases, which produced bogus tokens and read past the end.
                String comment = getComment(input, i);
                result.add(new Token(Type.COMMENTS, comment));
                i += comment.length();
            }
            else if (Character.isWhitespace(ch))
            {
                i++;
            }
            else if (ch >= '0' && ch <= '9')
            {
                String number = getAtom(input, i);
                i += number.length();
                // NOTE(review): length-based heuristic kept from the original. It cannot
                // tell a short machine-code byte such as "0C" from a line number, and a
                // hex field starting with A-F never reaches this branch. A reliable rule
                // probably needs the field's position in the line - confirm with the
                // listing format.
                if (number.length() < 4)
                {
                    result.add(new Token(Type.LINE_NUMBER, number));
                }
                else
                {
                    result.add(new Token(Type.ADDRESSES, number));
                }
            }
            else
            {
                String atom = getAtom(input, i);
                if (atom.isEmpty())
                {
                    // Neither whitespace nor alphanumeric (e.g. ',' or ':'). There is no
                    // token type for it, so skip it; without this the index never
                    // advanced and the loop ran forever.
                    i++;
                    continue;
                }
                i += atom.length();
                // Check is declared outside this file; presumably isMnem reports whether
                // the word is a known mnemonic (doc 3, pages 7 -> 8). Anything else is
                // treated as a label.
                if (Check.isMnem(atom))
                {
                    result.add(new Token(Type.MNUMONIC_NAMES, atom));
                }
                else
                {
                    result.add(new Token(Type.LABELS, atom));
                }
            }
        }
        return result;
    }

    /**
     * Test driver: lexes a file line by line and prints every token on its own line.
     *
     * @param args optional; if {@code args[0]} is present it is used as the path of the
     *             file to lex, otherwise the built-in default path is used
     * @throws IOException if the file cannot be opened or read
     */
    public static void main(String[] args) throws IOException {
        srcFilename = "C:\\Users\\abdcg\\Desktop\\School\\Concordia\\Semester 4\\SOEN 341\\Project B\\Sprint 1\\Lexer test\\Test 2.txt";
        if (args.length > 0 && args[0] != null)
        {
            srcFilename = args[0];
        }
        srcFile = new File(srcFilename);

        // try-with-resources closes the reader (and the underlying FileReader) even if
        // reading or lexing throws.
        try (BufferedReader br = new BufferedReader(new FileReader(srcFile)))
        {
            String line;
            while ((line = br.readLine()) != null)
            {
                for (Token t : lex(line))
                {
                    System.out.println(t);
                }
            }
        }
    }

}
Line Addr Machine Code  Label         Assembly Code        Comments        
1    0000 00                          halt      
2    0001 01                          pop       
3    0002 02                          dup       
4    0003 03                          exit      
5    0004 04                          ret       
6    0005 0C                          not       
7    0006 0D                          and       
8    0007 0E                          or        
9    0008 0F                          xor       
10   0009 10                          neg       
11   000A 11                          inc       
12   000B 12                          dec       
13   000C 13                          add       
14   000D 14                          sub       
15   000E 15                          mul       
16   000F 16                          div       
17   0010 17                          rem       
18   0011 18                          shl       
19   0012 19                          shr       
20   0013 1A                          teq       
21   0014 1B                          tne       
22   0015 1C                          tlt       
23   0016 1D                          tgt       
24   0017 1E                          tle       
25   0018 1F                          tge       
26   0019 00                          halt

Aucun commentaire:

Enregistrer un commentaire